web123456

【Web Crawler】Crawling, parsing, and storing data with HttpClient

// WangYiDaiCrawlManager: pages through a POST endpoint and collects selected JSON fields as text.
//
// NOTE(review): this listing is damaged. The package name, every import target, and the
// receiver/method name of most calls are missing (e.g. `JSONObject jsonObj = (jsonStr);`,
// `if ((()))`, `return ();`), so it does not compile as shown. In addition, the whole class sits
// on two physical lines, so each `//` comment swallows the remainder of its line. Restore the
// original line breaks and the stripped identifiers before using this code.
//
// What the visible code shows:
//   - column_key lists the four JSON fields read per item: "platName", "locationAreaName",
//     "locationCityName", "platUrl".
//   - crawlOnePage(urlPojo) returns the result of httpClientCrawlerImpl.crawl4Post(urlPojo).
//   - parserOnePage(jsonStr) reads the "list" entry of the parsed JSON, iterates it as
//     JSONObject items, emits each column_key value followed by "\t" and a "\n" per item
//     (presumably appended to the local stringBuilder -- TODO confirm), and increments the
//     static item_count once per item.
//   - processWangYiDai(url, max_page_number, filePath) loops pageNumber from 1 to
//     max_page_number, sets "currPage", "params" and "sort" (presumably on parasMap, which is
//     then presumably attached to urlPojo -- TODO confirm), crawls the page, and breaks out of
//     the loop with a "duplication" message (presumably when the page was already recorded in
//     uniqSet -- TODO confirm). Otherwise it appends the parsed page to all_items and counts it.
//     After the loop it reports both counters and passes (filePath, all_items.toString(),
//     "utf-8") to a stripped call, presumably a file writer -- TODO confirm.
//   - main runs processWangYiDai with url "/front_select-plat", 100 pages, and the output name
//     "NetEase Loan_Dataset.txt".
//
// NOTE(review): the `resultPojo != null` check comes after the duplicate check, which appears
// to read from resultPojo; if so, a null result would fail before the guard -- confirm once the
// stripped calls are restored.
// NOTE(review): the url literal has no scheme or host; it looks truncated -- verify.
package ; import ; import ; import ; import ; import ; import ; import ; import ; import ; import ; import ; import ; /** * NetEase Loan Crawl Manager * @author tsj-pc * */ public class WangYiDaiCrawlManager { public static HttpClientCrawlerImpl httpClientCrawlerImpl = new HttpClientCrawlerImpl(); public static String[] column_key = { "platName", "locationAreaName", "locationCityName", "platUrl" }; private static CrawlResultPojo crawlOnePage(UrlPojo urlPojo) { CrawlResultPojo resultPojo = httpClientCrawlerImpl.crawl4Post(urlPojo); return resultPojo; } public static int item_count = 0; public static String parserOnePage(String jsonStr) { // parse the json JSONObject jsonObj = (jsonStr); JSONArray jsonArray = (("list") .toString()); StringBuilder stringBuilder = new StringBuilder(); for (Object json : jsonArray) { JSONObject itemJson = (JSONObject) json; for (String column : column_key) { ((column) + "\t"); } ("\n"); item_count++; } return (); } public static void processWangYiDai(String url, int max_page_number, String filePath) { //Storage all crawling entries StringBuilder all_items = new StringBuilder(); UrlPojo urlPojo = new UrlPojo(url); Map<String, Object> parasMap = new HashMap<String, Object>(); int have_download_page_count = 0; Set<String> uniqSet = new HashSet<String>(); for (int pageNumber = 1; pageNumber <= max_page_number; pageNumber++) { ("currPage", pageNumber); ("params", ""); ("sort", 0); (parasMap); CrawlResultPojo resultPojo = crawlOnePage(urlPojo); if ((())) { ("If you encounter duplication, it means that the crawling has been completed!"); break; } else { (()); } if (resultPojo != null) { String content = (); String page_items = parserOnePage(content); all_items.append(page_items); have_download_page_count++; } } ("all items size---" + item_count); ("Already downloaded ---" + have_download_page_count); (filePath, all_items.toString(), "utf-8"); ("save successfully~"); } public static void main(String[] args) { String url = "/front_select-plat"; 
int max_page_number = 100; String fileName = "NetEase Loan_Dataset.txt"; processWangYiDai(url, max_page_number, fileName); ("done!"); } }