Scrapy? WeChat ?? ?? ??? ??? ? ??? ?????.
Jun 22, 2023 am 09:41 AMScrapy? WeChat ?? ??? ?? ??? ? ??? ?????
WeChat? ?? ? ?? ?? ?? ?? ?? ??? ????????, ??? ???? ?? ??? ?? ??? ??? ???. ?? ?? ?? ??? WeChat ?? ??? ??? ??? ?????. ???? ? ?? ??? ??, ??? ??? ? ?? ??? ??? ? ?? ?????. ? ??? ????, ???? ? ??? ???? ??? ??? ? ????.
??? ?? ???? Scrapy ?????? ???? WeChat ?? ?? ?? ????? ???? ??? ???????. Scrapy? ??? ???? ?? ??? ?? ???? ?? Python ? ??? ????????. ??? Scrapy? ??? ??? ???? ??????.
- Scrapy ?? ? ???? ??
????? Scrapy ?????? ????? ?? Scrapy ? ?? ?? ??? ???? ???. pip ??? ???? ??? ? ????. ?? ????? ??? ????.
pip install scrapy
pip install pymongo
pip install mysql-connector-python
Scrapy? ??? ? Scrapy ??? ??? ???? ????? ???? ???. ??? ??? ????.
scrapy startproject wechat
? ??? ???? Scrapy? "wechat"??? ????? ???? ???? ????? ?? ??? ????? ?????.
- WeChat ?? ?? ?? ??? ??
???? ???? ?? ?? WeChat ?? ?? ?? ???? URL ??? ???? ???. ???? WeChat ?? ?? ?? ???? URL? ??? ????.
https://mp.weixin.qq.com/s?__biz=XXX&mid=XXX&idx=1&sn=XXX&chksm=XXX#wechat_redirect
? ? __biz? WeChat ?? ??? ID? ????, mid? ??? ID? ????, idx? ??? ?? ??, sn? ??? ???? chksm? ?? ??? ?????. ??? ?? ?? ??? ?? ??? ?????? ?? ?? ??? ID? ?? ?? ???? URL? ???? ???. ? ? biz_id? ?? ??? ?? ?????.
?? ?? ??? ??? ????? ?? ??? ?? ?? ID? ?? ??? ??? ???? ???. ID ??? ??? ??? ?? ???? ? ????. ???? ?? ??? ID? ??? ??? ?? ?????.
# Official-account IDs to crawl (the `__biz` query parameter), one per line.
biz_ids = [
    'MzU5MjcwMzA4MA==',
    'MzI4MzMwNDgwMQ==',
    'MzAxMTcyMzg2MA==',
]
???? ?? ?? ??? ?? ??? ????? Spider? ???? ???. ???? ??? ?? ?? ID? ??? ? ??? ?? ??? ??? ID? Spider? ?????.
import scrapy
import re


class WeChatSpider(scrapy.Spider):
    """Crawl article URLs and titles from WeChat official-account home pages.

    Run with ``scrapy crawl wechat -a biz_id=ID`` for a single account, or
    ``scrapy crawl wechat -a biz_id=ID1,ID2,ID3`` for several accounts.
    """

    name = "wechat"
    allowed_domains = ["mp.weixin.qq.com"]

    def __init__(self, name=None, biz_id=None, **kwargs):
        """Build one start URL per official-account ID.

        Args:
            name: Optional spider name override, forwarded to ``scrapy.Spider``.
            biz_id: A single ID, a comma-separated string of IDs, or an
                iterable of IDs. When omitted, no start URLs are generated.
            **kwargs: Any other spider arguments, forwarded to ``scrapy.Spider``.
        """
        super().__init__(name=name, **kwargs)
        if isinstance(biz_id, str):
            raw_ids = biz_id.split(',')
        else:
            raw_ids = list(biz_id or [])
        # The URL template appends the "==" padding itself, so strip any
        # padding already present to avoid producing "====".
        cleaned = [str(raw).strip().rstrip('=') for raw in raw_ids]
        self.start_urls = [
            'https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz={}==#wechat_redirect'.format(biz)
            for biz in cleaned
            if biz
        ]

    def parse(self, response):
        """Yield a request per article link, then follow the next-page link."""
        for url in response.xpath('//h4[1]/a/@href').extract():
            # urljoin keeps absolute URLs as-is and resolves relative ones,
            # which would otherwise fail with "Missing scheme".
            yield scrapy.Request(response.urljoin(url), callback=self.parse_article)
        next_page = response.xpath('//a[@id="js_next"]/@href').extract_first()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)

    def parse_article(self, response):
        """Yield ``{'url', 'title'}`` for one article page.

        The title is an empty string when the title node is absent, instead
        of raising ``AttributeError`` on ``None.strip()``.
        """
        title = response.xpath('//h2[@class="rich_media_title"]/text()').extract_first(default='')
        yield {'url': response.url, 'title': title.strip()}
Spider? ?? ??? ??? ?? ?? ID? ???? ?? ?? ????? ???? ? ? ???? ????? ???? ?? ??? URL? ???? ????. ??, ?? ??? ?? ??? URL? ??? ???? ??? parse_article ???? ?????. ????? ? ??? ??? ???? ??? ?? ??? ????.
????? Spider? ????? ???? ?? ??? ???? ???.
scrapy crawl wechat -a biz_id=XXXXXXXX
????? ?? ?? ??? ???? ?? ????. ??? ?? ?? ??? ID? ???? ???.
scrapy crawl wechat -a biz_id=ID1,ID2,ID3
- ?? ??? ??
??? ???? ? ??? ??? URL? ??????(?: MongoDB, MySQL ?)? ???? ???. ???? pymongo ?????? ???? ???? ???? ???????.
import pymongo


class MongoPipeline:
    """Item pipeline that inserts every scraped item into a MongoDB collection."""

    # Name of the collection that receives the items.
    collection_name = 'wechat'

    def __init__(self, mongo_uri, mongo_db):
        """Remember the connection URI and database name for ``open_spider``."""
        self.mongo_uri, self.mongo_db = mongo_uri, mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline from the crawler's ``MONGO_URI`` / ``MONGO_DATABASE`` settings."""
        settings = crawler.settings
        uri = settings.get('MONGO_URI')
        database = settings.get('MONGO_DATABASE', 'items')
        return cls(mongo_uri=uri, mongo_db=database)

    def open_spider(self, spider):
        """Open the MongoDB client and select the configured database."""
        client = pymongo.MongoClient(self.mongo_uri)
        self.client = client
        self.db = client[self.mongo_db]

    def close_spider(self, spider):
        """Close the MongoDB client opened in ``open_spider``."""
        self.client.close()

    def process_item(self, item, spider):
        """Insert a plain-dict copy of ``item`` and pass the item on unchanged."""
        collection = self.db[self.collection_name]
        collection.insert_one(dict(item))
        return item
? ???????? MongoDB? ??? ??? ?? ???? ?????. ? ???? ??? ?? ???? ?? ?????? ???? ??? ? ????.
???? settings.py ???? ?????? ?? ????? ???? ???.
# MongoDB connection settings read by MongoPipeline.from_crawler.
MONGO_URI = 'mongodb://localhost:27017/'
MONGO_DATABASE = 'wechat'

# The project was created with `scrapy startproject wechat`, so the pipeline
# lives in the `wechat` package; a `myproject.` path would fail to import.
ITEM_PIPELINES = {'wechat.pipelines.MongoPipeline': 300}
????? Spider?? Pipeline? ???? MongoDB? ???? ?????.
class WeChatSpider(scrapy.Spider):
    """Crawl article URLs and titles from WeChat official-account home pages.

    Yielded items are handed to whatever pipelines ``ITEM_PIPELINES`` enables;
    no per-request flag is needed for that.
    """

    name = "wechat"
    allowed_domains = ["mp.weixin.qq.com"]

    def __init__(self, name=None, biz_id=None, **kwargs):
        """Build one start URL per official-account ID.

        Args:
            name: Optional spider name override, forwarded to ``scrapy.Spider``.
            biz_id: A single ID, a comma-separated string of IDs, or an
                iterable of IDs. When omitted, no start URLs are generated.
            **kwargs: Any other spider arguments, forwarded to ``scrapy.Spider``.
        """
        super().__init__(name=name, **kwargs)
        if isinstance(biz_id, str):
            raw_ids = biz_id.split(',')
        else:
            raw_ids = list(biz_id or [])
        # The URL template appends the "==" padding itself, so strip any
        # padding already present to avoid producing "====".
        cleaned = [str(raw).strip().rstrip('=') for raw in raw_ids]
        self.start_urls = [
            'https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz={}==#wechat_redirect'.format(biz)
            for biz in cleaned
            if biz
        ]

    def parse(self, response):
        """Yield a request per article link, then follow the next-page link."""
        for url in response.xpath('//h4[1]/a/@href').extract():
            # urljoin keeps absolute URLs as-is and resolves relative ones,
            # which would otherwise fail with "Missing scheme".
            yield scrapy.Request(response.urljoin(url), callback=self.parse_article)
        next_page = response.xpath('//a[@id="js_next"]/@href').extract_first()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)

    def parse_article(self, response):
        """Yield ``{'url', 'title'}`` exactly once for an article page.

        The previous version yielded the same item a second time whenever
        ``response.meta['pipeline']`` was set, which stored duplicates. The
        title is an empty string when the title node is absent.
        """
        title = response.xpath('//h2[@class="rich_media_title"]/text()').extract_first(default='')
        yield {'url': response.url, 'title': title.strip()}
? ???? response.meta.get('pipeline')? Spider?? ??? ????? ??? ?? ? ?????. ??? ?????? ????? Spider ??? ?? ??? ????? ?? ???.
yield scrapy.Request(url, callback=self.parse_article, meta={'pipeline': 1})
- Data analysis
????? Scrapy ? pandas? ?? ?????? ???? ???? ???? ??????.
????? MongoDB?? ???? ???? ???? CSV ??? ???????. ?? ?? ??? ???? CSV ??? ???? ???? ? ????.
?? ??? ??? ????.
import pandas as pd
from pymongo import MongoClient

# Export the crawled articles from MongoDB to CSV, then plot how many
# articles were collected per official account.

# Using the client as a context manager closes the connection even if the
# query fails.
with MongoClient('mongodb://localhost:27017/') as client:
    db = client['wechat']
    articles = db['wechat']
    cursor = articles.find()
    doc = list(cursor)

df = pd.DataFrame(doc)
df.to_csv('wechat.csv', encoding='utf-8')

# The spider stores only `url` and `title`, so derive the account ID from the
# `__biz` query parameter of the article URL when no `biz_id` column exists.
# NOTE(review): assumes stored URLs keep the `__biz=` parameter - confirm.
if 'biz_id' not in df.columns and 'url' in df.columns:
    df['biz_id'] = df['url'].str.extract(r'__biz=([^&#]+)', expand=False)

if 'biz_id' in df.columns and 'title' in df.columns:
    counts = df.groupby('biz_id')['title'].count()
    # Plotting an empty series raises, so only plot when there is data.
    if not counts.empty:
        counts.plot(kind='bar')
? ????? MongoDB ? Pandas ?????? ???? ???? ???? CSV ??? ??? ??? ?????. ?? Pandas? ??? ??? ?? ??? ???? ? ?? ??? ?? ?? ????? ??????.
? ??? Scrapy? WeChat ?? ?? ??? ??? ? ??? ?????.? ?? ?????. ??? ??? PHP ??? ????? ?? ?? ??? ?????!

? AI ??

Undress AI Tool
??? ???? ??

Undresser.AI Undress
???? ?? ??? ??? ?? AI ?? ?

AI Clothes Remover
???? ?? ???? ??? AI ?????.

Clothoff.io
AI ? ???

Video Face Swap
??? ??? AI ?? ?? ??? ???? ?? ???? ??? ?? ????!

?? ??

??? ??

???++7.3.1
???? ?? ?? ?? ???

SublimeText3 ??? ??
??? ??, ???? ?? ????.

???? 13.0.1 ???
??? PHP ?? ?? ??

???? CS6
??? ? ?? ??

SublimeText3 Mac ??
? ??? ?? ?? ?????(SublimeText3)

Scrapy? WeChat ?? ??? ?? ??? ? ??? ?????. WeChat? ?? ? ? ?? ?? ?? ?? ??? ????????, ??? ???? ?? ??? ?? ??? ??? ???. ?? ?? ?? ??? WeChat ?? ??? ??? ??? ?????. ???? ? ?? ??? ??, ??? ??? ? ?? ??? ??? ? ?? ?????. ? ??? ????, ???? ? ??? ???? ??? ??? ? ????. ??? ? ???? Scrapy ?????? ???? WeChat ?? ?? ?? ????? ???? ??? ???????. Scr

WeChat ?? ?? ??? ???? ???? ?? ??, ?? ??, ?? ??, ????? ?? ? ??? ??? ????. ??? ??: 1. ?? ?? ??? ?? ??? ??? V ??? ?? ?? ??? ?? ???. ? ??? ?? ??? ???? ??? ??? ???? ?? ?? ?? ??? ? ?? ??? ? ?? ????. 2. ?? ??. ??? ?? ??? ??? ?? ???? ? ?? ??? ??? ????. ?? ??, ??? ?? ??? ??? ?? ? ?? ?? ?? ?? WeChat ?? ??? ???? ? ????.

Scrapy? ?????? ???? ??? ????? ?? ? ?? ?? ?? Python ??? ????????. ??? ?? ????? Ajax ??? ?? ??? ????? Scrapy? ???? ?? ?? ?? ??????. ? ????? Ajax ??? ??? ???? ? Scrapy ?? ??? ?????. 1. Ajax ??? ?? ?? Ajax ??? ??: ???? ??? ?? ????? ????? ??? ??? ?? ? ??? ??? ??? ??? ???? ?? ??? ???? ?? ?? ???? ???? ???.

Scrapy? ????? ?? ??? ??? ?? ?? ? ?? Python ?? ??? ????????. ? ????? Scrapy ??? ???? LinkedIn?? ?? ??? ????? ??? ??? ?????. ?? URL ?? ?? ??? LinkedIn? ?? ???? ??? ?? ???. ??? LinkedIn ?? ?? ???? URL? ??? ???. LinkedIn ????? ?? ???? ?? ??? ??? ?

Scrapy? ????? ??? ???? ?? ? ??? ? ?? ??? Python ??? ????????. ??? Scrapy? ??? ? ??? URL? ????? ??? ?? ???? ???, ?? ?? ??? ??? ???? ???? ??? ????. ? ????? ?? URL? ???? ??? Scrapy ???? ???? ????? ? ?? Scrapy ??? ??? ?????. 1. Scrapy ???? start_urls ? allowed_domains ??? ????

Scrapy? ????? ??? ???? ???? ?? ? ??? ?? ??? Python ??? ????????. ?? ??? ???? HTML, XML, JSON ? ??? ??? ??? ??? ??? ????. ? ????? Scrapy? ???? ? ?? ??? ??? ?? ????? ??? ?????. 1. HTML ???? ????? Scrapy ????? ?????. ?? Scrapy ????? ???? ???. ???? ?? ?? ??? ?????: scrapys

Scrapy ????? Selenium ? PhantomJSScrapy ?? Scrapy? Python ??? ??? ? ??? ??????? ??? ??? ??? ?? ? ??? ?? ???????. ??? ?? ? ?? ?????? ???? ???? ?? ?? ???? ??? ??????? ?? ??? ????. ? ?? Selenium ? PhantomJS? ?????. Selenium? ?????? ?? ??? ??????? ? ?????? ???? ???? ? ????.

???? ??? ???? ??? ?? ?? ???? ?? ? ???? ????. ?? ???? ????? Douban Books? ???? ? ? ???? ?????. ??, Douban Books? ??? ?? ??? ??? ???? ???? ?? ?? ????? ??? ? ??? ????. ??? ? ??? ???? ?? ?? ?? ???? ??? ?? ?? ????. ?? Scrapy ??? ???? ???? ???? ? ????. Scrapy? Python ??? ?? ?? ? ??? ?????? ????? ??? ? ? ????.
