package com.ruoyi.web.controller.demo.controller;
import cn.hutool.http.HttpResponse;
import cn.hutool.http.HttpUtil;
import com.alibaba.excel.EasyExcel;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Objects;
/**
 * Command-line demo that pages through a Weibo user's posts, downloads the
 * pictures and videos attached to each post to local disk, and finally writes
 * one row per post to an Excel workbook.
 */
public class DemoWeiBo
{
    /**
     * Post-list endpoint; {@code %s} is the 1-based page number.
     * NOTE(review): the URL previously had no scheme/host and therefore could not be
     * requested at all; the host is assumed to be https://weibo.com - confirm.
     */
    private static final String LIST_URL =
            "https://weibo.com/ajax/statuses/mymblog?uid=1686546714&feature=0&page=%s";

    /** Long-text endpoint; {@code %s} is the post's {@code mblogid}. Same host assumption as above. */
    private static final String LONG_TEXT_URL = "https://weibo.com/ajax/statuses/longtext?id=%s";

    /** Browser-like User-Agent sent with every API request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36";

    /** Directory prefix (including trailing separator) under which media files are stored. */
    private static final String SAVE_DIR = "E:\\2024weibo\\";

    /** Path of the Excel workbook that receives the collected rows. */
    private static final String EXCEL_PATH = "E:/Weibo.xlsx";

    /**
     * Marker in the (tag-stripped) post text that triggers a long-text lookup.
     * NOTE(review): matched literally - confirm it equals what the API really returns.
     */
    private static final String EXPAND_MARKER = "...expand";

    /** Number of list pages requested at most. */
    private static final int MAX_PAGES = 3;

    /** Pause between two page requests, to avoid hammering the server. */
    private static final long PAGE_DELAY_MS = 700L;

    /** Connect timeout for media downloads. */
    private static final int CONNECT_TIMEOUT_MS = 10_000;

    /** Read timeout for media downloads (generous, because videos can be large). */
    private static final int READ_TIMEOUT_MS = 60_000;

    /**
     * Program entry point: crawls up to {@link #MAX_PAGES} pages of posts and exports
     * them to {@link #EXCEL_PATH}.
     *
     * <p>A failure while handling a single post is printed and that post is skipped; a
     * failure while fetching a page is printed and the next page is tried. Crawling
     * stops early when a page contains no posts or the thread is interrupted.
     *
     * @param args command line arguments (not used)
     * @throws ParseException never thrown by the current implementation; the clause is
     *                        kept so the method signature stays unchanged
     */
    public static void main(String[] args) throws ParseException {
        String cookie = "Your Cookies";
        // Only used to echo the creation time of each post to the console.
        SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        List<ExcelData> excelDataList = new ArrayList<>();

        for (int page = 1; page <= MAX_PAGES; page++) {
            JSONArray posts = null;
            try {
                System.out.println("Start to get the first" + page + "Page Data");
                JSONObject data = fetchData(String.format(LIST_URL, page), cookie);
                if (data != null) {
                    posts = data.getJSONArray("list");
                }
                if (posts != null) {
                    for (Object post : posts) {
                        // Isolate each post so that one malformed entry does not
                        // discard the remaining posts of the page.
                        try {
                            excelDataList.add(toExcelData((JSONObject) post, cookie, dateFormat));
                        } catch (Exception e) {
                            e.printStackTrace();
                        }
                    }
                }
                System.out.println("Third" + page + "Page data acquisition has been completed");
            } catch (Exception e) {
                // Request or JSON parsing failed for this page: report and try the next one.
                e.printStackTrace();
                continue;
            }

            // An empty page means there is nothing more to fetch.
            if (posts == null || posts.isEmpty()) {
                break;
            }
            try {
                Thread.sleep(PAGE_DELAY_MS);
            } catch (InterruptedException e) {
                // Preserve the interrupt status and stop crawling; export what we have.
                Thread.currentThread().interrupt();
                break;
            }
        }

        System.out.println("Excel write data starts");
        EasyExcel.write(EXCEL_PATH, ExcelData.class)
                .sheet("Sheet1")
                .doWrite(excelDataList);
        System.out.println("Excel write data ends");
    }

    /**
     * Sends a GET request with the crawler's headers and returns the {@code data}
     * object of the JSON response. The response is always closed.
     *
     * @param url    absolute URL to request
     * @param cookie value of the {@code Cookie} request header
     * @return the {@code data} object, or {@code null} if the body is empty or has no such member
     */
    private static JSONObject fetchData(String url, String cookie) {
        HttpResponse response = HttpUtil.createGet(url)
                .header("User-Agent", USER_AGENT)
                .header("Cookie", cookie)
                .execute();
        try {
            JSONObject root = JSON.parseObject(response.body());
            return root == null ? null : root.getJSONObject("data");
        } finally {
            response.close();
        }
    }

    /**
     * Converts one post of the list response into an Excel row, downloading its
     * pictures and video on the way.
     *
     * @param post       JSON object describing the post
     * @param cookie     cookie used if the full text has to be fetched separately
     * @param dateFormat format used to print the creation time to the console
     * @return the populated row
     */
    private static ExcelData toExcelData(JSONObject post, String cookie, SimpleDateFormat dateFormat) {
        // The deprecated Date(String) parser is kept on purpose: the exact format of
        // "created_at" is not visible here, and this parser is what the data was read with so far.
        Date created = new Date(post.getString("created_at"));
        System.out.println("created:" + dateFormat.format(created));

        // Strip HTML tags from the post text.
        String rawText = post.getString("text");
        String text = rawText == null ? "" : rawText.replaceAll("<[^<>]*>", "");

        String pictures = downloadPictures(post);
        String video = downloadVideo(post);

        // Truncated posts carry an "expand" marker; their full text lives behind another endpoint.
        if (text.lastIndexOf(EXPAND_MARKER) != -1) {
            text = expandLongText(post, text, cookie);
        }

        ExcelData row = new ExcelData();
        row.setDate(created);
        row.setLike(countOf(post, "attitudes_count"));
        row.setComment(countOf(post, "comments_count"));
        row.setRepost(countOf(post, "reposts_count"));
        row.setContent(text);
        row.setImgUrl(pictures);
        row.setVideoUrl(video);
        return row;
    }

    /**
     * Reads a numeric counter from the post.
     *
     * @param post post to read from
     * @param key  name of the counter member
     * @return the counter value, or 0 if the member is absent or empty
     * @throws NumberFormatException if the member is present but not an integer
     */
    private static long countOf(JSONObject post, String key) {
        String raw = post.getString(key);
        if (raw == null || raw.trim().isEmpty()) {
            return 0L;
        }
        return Long.parseLong(raw.trim());
    }

    /**
     * Downloads the largest variant of every picture referenced by the post.
     *
     * @param post post to read picture information from
     * @return the local paths of all pictures, each followed by a comma; empty if the
     *         post has no pictures
     */
    private static String downloadPictures(JSONObject post) {
        Long picNum = post.getLong("pic_num");
        JSONArray picIds = post.getJSONArray("pic_ids");
        JSONObject picInfos = post.getJSONObject("pic_infos");
        if (picNum == null || picNum <= 0 || picIds == null || picInfos == null) {
            return "";
        }

        StringBuilder savedPaths = new StringBuilder();
        for (Object picId : picIds) {
            // pic_ids holds the keys of the pic_infos members.
            JSONObject pic = picInfos.getJSONObject(String.valueOf(picId));
            JSONObject largest = pic == null ? null : pic.getJSONObject("largest");
            String imageUrl = largest == null ? null : largest.getString("url");
            if (imageUrl == null || imageUrl.isEmpty()) {
                continue;
            }
            String savePath = SAVE_DIR + fileNameFromUrl(imageUrl);
            downloadPicture(imageUrl, savePath);
            savedPaths.append(savePath).append(",");
        }
        return savedPaths.toString();
    }

    /**
     * Downloads the HD video of the post, if it has one.
     *
     * @param post post to read video information from
     * @return the local path of the video, or an empty string if the post has no
     *         {@code page_info.media_info.mp4_hd_url}
     */
    private static String downloadVideo(JSONObject post) {
        JSONObject pageInfo = post.getJSONObject("page_info");
        JSONObject mediaInfo = pageInfo == null ? null : pageInfo.getJSONObject("media_info");
        String videoUrl = mediaInfo == null ? null : mediaInfo.getString("mp4_hd_url");
        if (videoUrl == null || videoUrl.isEmpty()) {
            return "";
        }
        String savePath = SAVE_DIR + fileNameFromUrl(videoUrl);
        downloadPicture(videoUrl, savePath);
        return savePath;
    }

    /**
     * Fetches the complete text of a truncated post.
     *
     * @param post      the truncated post (its {@code mblogid} identifies the long text)
     * @param truncated the truncated text, used as fallback
     * @param cookie    value of the {@code Cookie} request header
     * @return the full text, or {@code truncated} if it could not be obtained
     */
    private static String expandLongText(JSONObject post, String truncated, String cookie) {
        try {
            String mblogid = post.getString("mblogid");
            JSONObject data = fetchData(String.format(LONG_TEXT_URL, mblogid), cookie);
            String longTextContent = data == null ? null : data.getString("longTextContent");
            System.out.println("longTextContent:" + longTextContent);
            return longTextContent == null ? truncated : longTextContent;
        } catch (RuntimeException e) {
            // Best effort: keep the truncated text rather than losing the whole post.
            e.printStackTrace();
            return truncated;
        }
    }

    /**
     * Extracts the last path segment of a URL, ignoring any query string.
     *
     * @param url URL of the resource
     * @return the text between the last {@code '/'} and the first {@code '?'} (or the end)
     */
    private static String fileNameFromUrl(String url) {
        int queryStart = url.indexOf('?');
        String path = queryStart >= 0 ? url.substring(0, queryStart) : url;
        return path.substring(path.lastIndexOf('/') + 1);
    }

    /**
     * Downloads a resource (picture or video) over HTTP and stores it at the given path.
     *
     * <p>The parent directory of {@code savePath} is created if necessary. This method
     * is best effort: a non-200 response is reported on standard output, any other
     * failure has its stack trace printed, and in both cases the method returns normally.
     *
     * @param imageUrl The URL address of the resource
     * @param savePath Local path to save the resource
     */
    public static void downloadPicture(String imageUrl, String savePath) {
        HttpURLConnection connection = null;
        try {
            connection = (HttpURLConnection) new URL(imageUrl).openConnection();
            connection.setRequestMethod("GET");
            // Without timeouts a stalled server would block the crawler forever.
            connection.setConnectTimeout(CONNECT_TIMEOUT_MS);
            connection.setReadTimeout(READ_TIMEOUT_MS);
            connection.connect();

            int responseCode = connection.getResponseCode();
            if (responseCode != HttpURLConnection.HTTP_OK) {
                System.out.println("Cannot download pictures/videos, response code:" + responseCode);
                return;
            }

            // FileOutputStream does not create missing directories by itself.
            File parent = new File(savePath).getAbsoluteFile().getParentFile();
            if (parent != null && !parent.mkdirs() && !parent.isDirectory()) {
                throw new IOException("Cannot create directory: " + parent);
            }

            // try-with-resources closes both streams even if one close() fails.
            try (BufferedInputStream in = new BufferedInputStream(connection.getInputStream());
                 FileOutputStream out = new FileOutputStream(savePath)) {
                byte[] buffer = new byte[8192];
                int bytesRead;
                while ((bytesRead = in.read(buffer)) != -1) {
                    out.write(buffer, 0, bytesRead);
                }
            }
            System.out.println("Picture/Video downloaded successfully, save the path:" + savePath);
        } catch (IOException | RuntimeException e) {
            e.printStackTrace();
        } finally {
            if (connection != null) {
                connection.disconnect();
            }
        }
    }
}