
2024 brand-new Java-based Weibo (microblog) data crawler (full version)

package com.ruoyi.web.controller.demo.controller;

import cn.hutool.http.HttpResponse;
import cn.hutool.http.HttpUtil;
import com.alibaba.excel.EasyExcel;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;

import java.io.BufferedInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Objects;

public class DemoWeiBo {

    /**
     * Main entry point: grabs data from Weibo and stores it in Excel.
     *
     * @param args command line arguments (not used)
     * @throws ParseException thrown when a date fails to parse
     */
    public static void main(String[] args) throws ParseException {
        // URL templates for the Weibo list API and the long-text ("expand") API
        // (prefix with the site host if it is not already configured elsewhere)
        String url = "/ajax/statuses/mymblog?uid=1686546714&feature=0&page=%s";
        String unfoldurl = "/ajax/statuses/longtext?id=%s";
        String cookie = "Your Cookies";

        // Initialize the date format
        SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

        // Initialize the list of rows to export to Excel
        List<ExcelData> excelDataList = new ArrayList<>();

        // Crawl the first 3 pages of data in a loop
        for (int i = 1; i <= 3; i++) {
            try {
                // Log the page being fetched
                System.out.println("Start fetching page " + i + " data");

                // Format the URL and send an HTTP GET request
                String urlstr = String.format(url, i);
                HttpResponse response = HttpUtil.createGet(urlstr)
                        .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36")
                        .header("Cookie", cookie)
                        .execute();

                // Parse the response body
                String body = response.body();
                JSONObject jsonObject = JSON.parseObject(body).getJSONObject("data");
                JSONArray list = null;
                if (Objects.nonNull(jsonObject)) {
                    // Process the data list
                    list = jsonObject.getJSONArray("list");
                    // Iterate over each Weibo post
                    for (Object o : list) {
                        JSONObject data = (JSONObject) o;

                        // Parse the basic fields of the post
                        Date created = new Date(data.getString("created_at"));
                        System.out.println("created:" + dateFormat.format(created));
                        String regex = "<[^<>]*>";
                        String text = data.getString("text").replaceAll(regex, "");
                        String repost = data.getString("reposts_count");
                        String comment = data.getString("comments_count");
                        String like = data.getString("attitudes_count");

                        // Collect the images attached to the post
                        StringBuffer pic_url = new StringBuffer();
                        Long pic_num = data.getLong("pic_num");
                        if (pic_num > 0) {
                            JSONArray pic_ids = data.getJSONArray("pic_ids");
                            JSONObject pic_infos = data.getJSONObject("pic_infos");
                            // Traverse pic_ids to look up each child object of pic_infos by key
                            for (Object json : pic_ids) {
                                String key = (String) json;
                                JSONObject pic = pic_infos.getJSONObject(key);
                                JSONObject largest = pic.getJSONObject("largest");
                                // Extract the image URL and derive a local file name
                                String imageUrl = largest.getString("url");
                                String filename = imageUrl.substring(imageUrl.lastIndexOf("/") + 1);
                                // Download the picture
                                String savePath = "E:\\2024weibo\\" + filename;
                                downloadPicture(imageUrl, savePath);
                                pic_url = pic_url.append(savePath).append(",");
                            }
                        }

                        // Collect the video attached to the post, if any
                        String video_url = "";
                        JSONObject page_info = data.getJSONObject("page_info");
                        if (Objects.nonNull(page_info)) {
                            JSONObject media_info = page_info.getJSONObject("media_info");
                            String mp4_hd_url = media_info.getString("mp4_hd_url");
                            String filename = mp4_hd_url.substring(mp4_hd_url.lastIndexOf("/") + 1, mp4_hd_url.indexOf("?"));
                            // Download the video
                            String savePath = "E:\\2024weibo\\" + filename;
                            downloadPicture(mp4_hd_url, savePath);
                            video_url = savePath;
                        }

                        // When a post is too long, Weibo truncates the text and shows an "...expand" button.
                        // In that case a second request is needed to fetch the full text content.
                        if (text.lastIndexOf("...expand") != -1) {
                            // The post is truncated, so re-fetch the full text
                            String mblogid = data.getString("mblogid");
                            // Format the URL and send an HTTP GET request
                            String unfoldurlstr = String.format(unfoldurl, mblogid);
                            HttpResponse response2 = HttpUtil.createGet(unfoldurlstr)
                                    .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36")
                                    .header("Cookie", cookie)
                                    .execute();
                            // Response shape: {"ok": 1, "http_code": 200, "data": {...}}
                            String body2 = response2.body();
                            JSONObject jsonObject2 = JSONObject.parseObject(body2).getJSONObject("data");
                            String longTextContent = jsonObject2.getString("longTextContent");
                            System.out.println("longTextContent:" + longTextContent);
                            // Replace the truncated text with the full content
                            text = longTextContent;
                        }

                        // Create an ExcelData row and fill in the fields
                        ExcelData excelData = new ExcelData();
                        // Publish time
                        excelData.setDate(created);
                        // Likes
                        excelData.setLike(Long.parseLong(like));
                        // Comments
                        excelData.setComment(Long.parseLong(comment));
                        // Reposts
                        excelData.setRepost(Long.parseLong(repost));
                        // Original content
                        excelData.setContent(text);
                        // Picture paths
                        excelData.setImgUrl(pic_url.toString());
                        // Video path
                        excelData.setVideoUrl(video_url);
                        excelDataList.add(excelData);
                    }
                }

                // Log completion, close the response, and sleep to avoid sending requests too frequently
                System.out.println("Page " + i + " data fetch completed");
                response.close();
                // If the list is empty, terminate the loop
                if (list == null || list.size() == 0) {
                    break;
                }
                Thread.sleep(700);
            } catch (Exception e) {
                // Print exception information
                e.printStackTrace();
            }
        }

        // Write the collected rows to Excel
        System.out.println("Excel write starts");
        EasyExcel.write("E:/Weibo.xlsx", ExcelData.class)
                .sheet("Sheet1")
                .doWrite(excelDataList);
        System.out.println("Excel write ends");
    }

    /**
     * Download an image (or video) to the specified path.
     *
     * @param imageUrl the URL of the file to download
     * @param savePath the local path to save the file to
     */
    public static void downloadPicture(String imageUrl, String savePath) {
        BufferedInputStream in = null;
        FileOutputStream out = null;
        HttpURLConnection connection = null;
        try {
            // Create the URL object and open a connection
            URL url = new URL(imageUrl);
            connection = (HttpURLConnection) url.openConnection();
            // Set the request method to GET
            connection.setRequestMethod("GET");
            // Open the connection
            connection.connect();
            // Check the response code to determine whether the download can proceed
            int responseCode = connection.getResponseCode();
            if (responseCode == HttpURLConnection.HTTP_OK) {
                // Create the input and output streams for reading and saving the file
                in = new BufferedInputStream(connection.getInputStream());
                out = new FileOutputStream(savePath);
                // Buffer used to read and write a chunk of data at a time
                byte[] buffer = new byte[1024];
                int bytesRead;
                // Copy until there is no more data
                while ((bytesRead = in.read(buffer)) != -1) {
                    out.write(buffer, 0, bytesRead);
                }
                System.out.println("Picture/video downloaded successfully, saved to: " + savePath);
            } else {
                // The response code is not HTTP_OK, so the download failed
                System.out.println("Cannot download picture/video, response code: " + responseCode);
            }
        } catch (Exception e) {
            // Catch exceptions and print the stack trace
            e.printStackTrace();
        } finally {
            // Close streams and the connection regardless of success or failure
            // Close the input stream
            if (in != null) {
                try {
                    in.close();
                } catch (IOException e) {
                    // Rethrow the IOException as a runtime exception
                    throw new RuntimeException(e);
                }
            }
            // Close the output stream
            if (out != null) {
                try {
                    out.close();
                } catch (IOException e) {
                    // Rethrow the IOException as a runtime exception
                    throw new RuntimeException(e);
                }
            }
            // Close the connection
            if (connection != null) {
                connection.disconnect();
            }
        }
    }
}
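The main method above fills rows of type ExcelData, but the article never shows that class. Below is a minimal sketch of what it could look like, assuming plain fields plus EasyExcel's @ExcelProperty annotations; only the setter names come from the code above, while the column titles are illustrative.

package com.ruoyi.web.controller.demo.controller;

import com.alibaba.excel.annotation.ExcelProperty;
import java.util.Date;

public class ExcelData {
    @ExcelProperty("Publish time")
    private Date date;
    @ExcelProperty("Content")
    private String content;
    @ExcelProperty("Likes")
    private Long like;
    @ExcelProperty("Comments")
    private Long comment;
    @ExcelProperty("Reposts")
    private Long repost;
    @ExcelProperty("Image paths")
    private String imgUrl;
    @ExcelProperty("Video path")
    private String videoUrl;

    // Getters and setters used by DemoWeiBo and by EasyExcel when writing rows
    public Date getDate() { return date; }
    public void setDate(Date date) { this.date = date; }
    public String getContent() { return content; }
    public void setContent(String content) { this.content = content; }
    public Long getLike() { return like; }
    public void setLike(Long like) { this.like = like; }
    public Long getComment() { return comment; }
    public void setComment(Long comment) { this.comment = comment; }
    public Long getRepost() { return repost; }
    public void setRepost(Long repost) { this.repost = repost; }
    public String getImgUrl() { return imgUrl; }
    public void setImgUrl(String imgUrl) { this.imgUrl = imgUrl; }
    public String getVideoUrl() { return videoUrl; }
    public void setVideoUrl(String videoUrl) { this.videoUrl = videoUrl; }
}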
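One more note: the crawler parses created_at with the deprecated new Date(String) constructor. If that proves unreliable on your JVM or locale, and assuming the field follows the commonly seen "EEE MMM dd HH:mm:ss Z yyyy" layout (an assumption, not stated in the article), a small helper like this could be swapped in:

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;

public class CreatedAtParser {
    // Assumes created_at looks like "Tue May 07 10:27:01 +0800 2024" (not confirmed by the article)
    private static final SimpleDateFormat WEIBO_FORMAT =
            new SimpleDateFormat("EEE MMM dd HH:mm:ss Z yyyy", Locale.ENGLISH);

    public static Date parse(String createdAt) throws ParseException {
        return WEIBO_FORMAT.parse(createdAt);
    }
}

Pinning Locale.ENGLISH matters here because the month and weekday names are English; parsing with the system default locale can fail on non-English machines.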