web123456

Crawl CCTV hot list and store it in MongoDB

import re

import pymongo
import requests
  • headers = {
  • # Request tool identification
  • "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (HTML, like Gecko) "
  • "Chrome/127.0.0.0 Safari/537.36"
  • }
  • url = '/top/?spm=C28340.PdNvWY0LYxCP.EtmP5mypaGE4.11'
  • res = (url, headers=headers)
  • con = ("utf8")
  • datas = (r'<ul>.*?</ul>', con, )
  • result = {
  • "Hot List": {
  • "name": "Hot List",
  • "items": []
  • },
  • "cartoon": {
  • "name": "cartoon",
  • "items": []
  • },
  • "TV drama": {
  • "name": "TV drama",
  • "items": []
  • },
  • "Documentary": {
  • "name": "Documentary",
  • "items": []
  • },
  • "Special Program": {
  • "name": "Special Program",
  • "items": []
  • }
  • }
  • # print(datas[1])
  • items = (
  • r'<li.*?lazy="(.*?)".*?<div class="text"><a href=".*?" target="_blank">(.*?)</a>'
  • r'</div>.*?<div class="column"><i class="icon_l"></i><a href=".*?" target="_blank">(.*?)</a>'
  • r'<i class="icon_r"></i></div>.*?</li>',
  • datas[1], )
  • for item in items:
  • # print(item)
  • result["Hot List"]["items"].append({
  • "img": item[0],
  • "title": item[1],
  • "category": item[2]
  • })
  • # pass
  • # print(datas[2])
  • items = (
  • r'<li.*?lazy="(.*?)" width="188" height="250"></a>.*?<span class="number"><i class="icon_l">'
  • r'</i><i class="txt">(.*?)</i><i class="icon_r"></i></span>.*?</div>.*?'
  • r'<a class="cover" href=".*?" target="_blank">.*?</a>.*?<div class="text">'
  • r'<a href=".*?" target="_blank">(.*?)</a></div>.*?<p><a href=".*?" target="_blank">(.*?)</a></p>.*?</li>'
  • , datas[2],
  • )
  • for item in items:
  • # print(item)
  • result["cartoon"]["items"].append({
  • "img": item[0],
  • "title": item[2],
  • "category": item[1],
  • "synopsis": item[3]
  • })
  • # pass
  • # print(datas[3])
  • items = (
  • r'<li.*?lazy="(.*?)" width="188" height="250"></a>.*?<span class="number">'
  • r'<i class="icon_l"></i><i class="txt">(.*?)</i><i class="icon_r"></i></span>.*?</div>.*?'
  • r'<a class="cover" href=".*?" target="_blank">.*?</a>.*?<div class="text"><a href=".*?" target="_blank">(.*?)</a>'
  • r'</div>.*?<p><a href=".*?" target="_blank">(.*?)</a></p>.*?</li>'
  • , datas[3],
  • )
  • for item in items:
  • # print(item)
  • result["TV drama"]["items"].append({
  • "img": item[0],
  • "title": item[2],
  • "episode": item[1],
  • "synopsis": item[3]
  • })
  • # pass
  • # print(datas[4])
  • items = (
  • r'<li.*?lazy="(.*?)" width="188" height="250"></a>.*?<span class="number">'
  • r'<i class="icon_l"></i><i class="txt">(.*?)</i><i class="icon_r"></i>'
  • r'</span>.*?</div>.*?<a class="cover" href=".*?" target="_blank">.*?</a>.*?<div class="text">'
  • r'<a href=".*?" target="_blank">(.*?)</a></div>.*?<p><a href=".*?" target="_blank">(.*?)</a></p>.*?</li>'
  • , datas[4],
  • )
  • for item in items:
  • # print(item)
  • result["Documentary"]["items"].append({
  • "img": item[0],
  • "title": item[2],
  • "category": item[1],
  • "synopsis": item[3]
  • })
  • # pass
  • # print(datas[5])
  • items = (
  • r'<li.*?lazy="(.*?)" width="188" height="250"></a>.*?<span class="number">'
  • r'<i class="icon_l"></i><i class="txt">(.*?)</i><i class="icon_r"></i></span>.*?</div>.*?'
  • r'<a class="cover" href=".*?" target="_blank">.*?</a>.*?<div class="text"><a href=".*?" target="_blank">(.*?)</a>'
  • r'</div>.*?<p><a href=".*?" target="_blank">(.*?)</a></p>.*?</li>'
  • , datas[5],
  • )
  • for item in items:
  • # print(item)
  • result["Special Program"]["items"].append({
  • "img": item[0],
  • "title": item[2],
  • "tv": item[1],
  • "synopsis": item[3]
  • })
  • # pass
  • # print(result)
  • client = ()
  • db = client.get_default_database("cctv")
  • collection = db.get_collection("top")
  • collection.insert_one(result)
  • ()