Python 官方文档:入门教程 => 点击学习
文章目录 前言一、开始准备1. 包管理和安装chrome驱动2. 爬虫项目的创建(举个栗子)3. setting.py的配置 二、代码演示1. 主爬虫程序2. 中间件的配置3. 定义ite
scrapy和selenium的整合使用
先定个小目标实现万物皆可爬!我们是用scrapy框架来快速爬取页面上的数据,它是自带并发的,速度是可以的。但是一些ajax异步的请求我们不能这么爬取。我们要视同selenium来进行lazy loading,也就是懒加载,渲染到页面加载数据。
首先你要安装以下包:
pip install scrapypip install selenium == 3.0.0pip install pyMysqlpip install bs4
scrapy startproject cnki
scrapy genspider cnki https://www.cnki.net
# 运行不导出(一般在pipelines做导出操作)scrapy crawl cnki# 针对不同的选择可以导出为xlsx、JSON等格式文件scrapy crawl demo -o demo.csv
DB_HOST = 'localhost'DB_PORT = 3306DB_USER = 'root'DB_PASSWord ='123456'DB_DATABASE = 'spider'
LOG_LEVEL = 'WARNING'
Mozilla/5.0 (windows NT 10.0; Win64; x64) AppleWEBKit/537.36 (Khtml, like Gecko) Chrome/107.0.0.0 Safari/537.36
{ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'en',}
DOWNLOAD_DELAY = 3RANDOMIZE_DOWNLOAD_DELAY=True
SPIDER_MIDDLEWARES # 蜘蛛中间件DOWNLOADER_MIDDLEWARES # 下载中间件ITEM_PIPELINES # 管道
def __init__(self, *args,**kwargs): option = webdriver.ChromeOptions() # 实例化一个浏览器对象 option.add_argument('--headless') # 添加参数,option可以是headless,--headless,-headless self.driver = webdriver.Chrome(options=option) # 创建一个无头浏览器 # self.driver = webdriver.Chrome() # 创建一个无头浏览器 time.sleep(3) super(CnkiSpider, self).__init__(*args, **kwargs) dispatcher.connect(self.close_driver,signals.spider_closed)
def start_requests(self): for url in self.start_urls: yield scrapy.Request( # 这里可以设置多个页面,一般用于分页的 url=url, )
def close_driver(self): print("爬虫正在退出,执行关闭浏览器哦") time.sleep(2) self.driver.quit()
def parse(self,response: HtmlResponse):sel = Selector(response)dds = sel.CSS('.journal > .main-w1 > dl > dd')for dd in dds:title = dd.css('h6 > a::attr(title)').extract_first() link = dd.css('h6 > a::attr(href)').extract_first() link = response.urljoin(link) author = dd.css('.baseinfo > span > #author::attr(title)').extract_first() abstract = dd.css('.abstract::text').extract_first() count = dd.css('.opts > .opts-count > li > em::text').extract_first() count = int(count) date = dd.css('.opts > .opts-count > .date::text').extract_first() date = date.split(':')[1] date = datetime.datetime.strptime(date,"%Y-%m-%d") rc = Recommend() rc['title'] = title rc['link'] = link rc['author'] = author rc['abstract'] = abstract rc['count'] = count rc['date'] = date yield rc
这里要注意我们yield可以返回不仅是item,也可以是Request,进行页面详情的请求(套娃)
yield Request( url=link, # 这是上面页面上的链接,用来进一步请求 callback=self.parse_detail, # 这是回调函数 cb_kwargs={'item':rc} # 这是把上面的item传递下来 )
class SeleniumDownloaderMiddleware: def process_request(self, request , spider): if spider.name == 'cnki': spider.driver.get(request.url) time.sleep(2) print(f"当前访问{request.url}") spider.driver.refresh() time.sleep(3) return HtmlResponse(url=spider.driver.current_url,body=spider.driver.page_source,encoding='utf-8')
# 我自定义的解析cookie方法def get_cookie_dict(): cookie_str = 填上你的cookie cookie_dict = {} for item in cookie_str.split(';'): key, value = item.split('=',maxsplit=1) cookie_dict[key] = value return cookie_dictCOOKIES_DICT = get_cookie_dict()
# 这是DownloaderMiddleware这是自带的方法哈def process_request(self, request : Request, spider): request.cookies = COOKIES_DICT return None
用来接受爬虫到的数据
class Recommend(scrapy.Item): title = scrapy.Field() author = scrapy.Field() abstract = scrapy.Field() link = scrapy.Field() count = scrapy.Field() date = scrapy.Field()
class RecommendPipeline: @claSSMethod def from_crawler(cls, crawler: Crawler): host = crawler.settings['DB_HOST'] port = crawler.settings['DB_PORT'] username = crawler.settings['DB_USER'] password = crawler.settings['DB_PASSWORD'] database = crawler.settings['DB_DATABASE'] return cls(host, port, username, password, database) def __init__(self, host, port, username, password, database): # 1、与数据库建立连接 self.conn = pymysql.connect(host=host, port=port, user=username, password=password, database=database, charset='utf8mb4') # 2、创建游标 self.cursor = self.conn.cursor() # 3、批处理需要的容器 self.data = [] def process_item(self, item, spider): title = item.get('title', '') author = item.get('author', '') abstract = item.get('abstract', '') link = item.get('link', '') count = item.get('count', '') date = item.get('date', '') # 如果要实现批处理: self.data.append((title,author,abstract,link,count,date)) # 如果存够了10条就进数据库 if len(self.data) == 10: self._to_write_db() # 然后再清空 self.data.clear() return item def close_spider(self, spider): # 如果最后不满足10条 if len(self.data) > 0: self._to_write_db() self.conn.close() def _to_write_db(self): # 作为一个实时的推荐,我希望将查到的数据作为一个temp # 'delete from tb_recommend where 1 = 1' 删除满,并且主键自增不会从1开始 self.cursor.execute( 'truncate table tb_recommend' ) self.cursor.executemany( 'insert into tb_recommend (title,author,abstract,link,count,date) values (%s, %s, %s, %s, %s, %s)', self.data ) self.conn.commit()
记得写入setting.py,设置其权重。
*接下来您就可以按照这种方法‘愉’ ‘快’的进行爬虫啦!!! *
这是scrapy和selenium的具体整合使用,scrapy框架的内容还有很多方法还没用到,都有待开发。其次就是selenium的填充之类的操作还没有使用,还需要去复习selenium的api。
来源地址:https://blog.csdn.net/wnagchenyu/article/details/129793242
--结束END--
本文标题: Scrapy和Selenium整合(一文搞定)
本文链接: https://lsjlt.com/news/386071.html(转载时请注明来源链接)
有问题或投稿请发送至: 邮箱/279061341@qq.com QQ/279061341
2024-03-01
2024-03-01
2024-03-01
2024-02-29
2024-02-29
2024-02-29
2024-02-29
2024-02-29
2024-02-29
2024-02-29
回答
回答
回答
回答
回答
回答
回答
回答
回答
回答
0