Python 官方文档:入门教程 => 点击学习
准备环境:一般来说,在线看漫画的网站都会使用 JavaScript 来返回页面。打开百度搜索"在线漫画",如下图。目标网站:http://www.1kkk.com (极速漫画),选取一个漫画爬取:http://www.1kkk.com/manhua1963/
准备环境
一般来说在线看漫画的网站都会使用JavaScript来返回页面,打开百度搜索在线漫画,如下图:
目标网站: http://www.1kkk.com 极速漫画,选取一个漫画爬取 http://www.1kkk.com/manhua1963/
目标分析:
爬虫代码:
创建一个GetComic.py文件,代码如下:
import os
import time

from selenium import webdriver

from mylog import MyLog as mylog
class GetComic(object):
    """Save every page of one comic chapter as a PNG screenshot.

    Instantiating the class runs the whole job: it opens ``startUrl`` in a
    headless PhantomJS browser, creates a directory named after the page
    title, and screenshots each page of the chapter into that directory.

    NOTE(review): the code uses ``webdriver.PhantomJS`` and the
    ``find_element_by_*`` helpers, so it presumably targets Selenium 3.x or
    older -- confirm the installed version before running.
    """

    def __init__(self):
        # Lower-case scheme: the original 'Http://' was a casing artefact.
        self.startUrl = 'http://www.1kkk.com/ch223-85528/'
        self.log = mylog()
        self.browser = self.getBrowser()
        # saveCartoon() copes with a failed start-up (browser is None).
        self.saveCartoon(self.browser)

    def getBrowser(self):
        """Start PhantomJS and load ``self.startUrl``.

        Returns:
            The ready browser, or ``None`` when start-up or page load failed
            (the failure is logged, not raised).
        """
        browser = None
        try:
            browser = webdriver.PhantomJS()
            browser.get(self.startUrl)
            browser.implicitly_wait(20)
        except Exception as e:
            self.log.error('open the %s failed : %s' % (self.startUrl, e))
            if browser is not None:
                # Do not leave a half-started PhantomJS process behind.
                try:
                    browser.quit()
                except Exception as quit_error:
                    self.log.error('close browser failed : %s' % quit_error)
            return None
        return browser

    def saveCartoon(self, browser):
        """Screenshot every page of the chapter into a per-comic directory.

        The directory name is the part of the page title before the first
        ``_``. The process working directory is changed to that directory,
        so the screenshots ``1.png``, ``2.png``, ... land inside it.

        Args:
            browser: Browser returned by ``getBrowser()``; ``None`` is
                accepted and only logs an error.

        Raises:
            SystemExit: after a successful run, matching the original
                ``exit()`` call that ended the script.
        """
        if browser is None:
            self.log.error('save img failed : browser is not available')
            return

        try:
            cartoonTitle = browser.title.split('_')[0]
            self.createDir(cartoonTitle)
            os.chdir(cartoonTitle)

            # The last link of the chapter pager carries the page count.
            lastPageLink = browser.find_element_by_xpath(
                "//div[@class='chapterpager']/a[last()]")
            sumPage = int(lastPageLink.text)

            # The implicit wait is a session-wide setting; set it once.
            browser.implicitly_wait(30)

            for i in range(1, sumPage + 1):
                imgName = str(i) + '.png'
                if browser.get_screenshot_as_file(imgName):
                    self.log.info('save img %s' % imgName)
                else:
                    self.log.error('save img %s failed' % imgName)

                # Only turn the page while there is a next page; clicking
                # after the last one would leave the chapter or fail.
                if i < sumPage:
                    browser.find_element_by_partial_link_text("下一页").click()
                    # Give the JavaScript-rendered page time to load.
                    time.sleep(10)

            self.log.info('save img success')
        finally:
            # Always release the PhantomJS process, even on errors.
            browser.quit()

        # Same effect as the original exit(), without relying on the
        # site-provided builtin.
        raise SystemExit

    def createDir(self, dirName):
        """Create ``dirName``, logging the outcome instead of raising.

        Args:
            dirName: Path of the directory to create.
        """
        if os.path.exists(dirName):
            self.log.error('create directory %s failed, have a same name file or directory' % dirName)
            return

        try:
            os.makedirs(dirName)
        except OSError as e:
            self.log.error('create directory %s failed : %s' % (dirName, e))
        else:
            self.log.info('create directory %s success' % dirName)
if __name__ == '__main__':
    # Constructing GetComic is the whole program: its __init__ opens the
    # browser and saves the chapter pages.
    ST = GetComic()
mylog.py日志文件代码如下:
import logging
import getpass
import sys
# 定义MyLog类
class MyLog(object):
    """Small logging wrapper that writes to a log file and to the console.

    The logger is named after the current user. The log file sits next to
    the running script: ``sys.argv[0]`` with a trailing ``.py`` replaced by
    ``.log``.
    """

    def __init__(self):
        self.user = getpass.getuser()  # current user name, used as logger name
        self.logger = logging.getLogger(self.user)
        self.logger.setLevel(logging.DEBUG)

        # Derive the log file name from the calling script. Only strip a
        # real '.py' suffix; blindly dropping three characters mangles any
        # other script name.
        script = sys.argv[0]
        if script.endswith('.py'):
            script = script[:-3]
        self.logfile = script + '.log'

        # Both handlers read this attribute, so its name must be 'formatter'.
        self.formatter = logging.Formatter('%(asctime)-12s %(levelname)-8s %(message)-12s\r\n')

        # Send every record to the log file ...
        self.logHand = logging.FileHandler(self.logfile, encoding='utf-8')
        self.logHand.setFormatter(self.formatter)
        self.logHand.setLevel(logging.DEBUG)

        # ... and to the console as well.
        self.logHandSt = logging.StreamHandler()
        self.logHandSt.setFormatter(self.formatter)
        self.logHandSt.setLevel(logging.DEBUG)

        self.logger.addHandler(self.logHand)
        self.logger.addHandler(self.logHandSt)

    # One method per logging level.
    def debug(self, msg):
        """Log ``msg`` at DEBUG level."""
        self.logger.debug(msg)

    def info(self, msg):
        """Log ``msg`` at INFO level."""
        self.logger.info(msg)

    def warn(self, msg):
        """Log ``msg`` at WARNING level."""
        # Logger.warn is a deprecated alias; call Logger.warning instead.
        self.logger.warning(msg)

    def error(self, msg):
        """Log ``msg`` at ERROR level."""
        self.logger.error(msg)

    def critical(self, msg):
        """Log ``msg`` at CRITICAL level."""
        self.logger.critical(msg)
if __name__ == '__main__':
    # Self-test: emit one record per level, each with non-ASCII text.
    mylog = MyLog()
    demo_calls = (
        (mylog.debug, u"I'm debug 中文测试"),
        (mylog.info, u"I'm info 中文测试"),
        (mylog.warn, u"I'm warn 中文测试"),
        (mylog.error, u"I'm error 中文测试"),
        (mylog.critical, u"I'm critical 中文测试"),
    )
    for emit, text in demo_calls:
        emit(text)
运行主程序GetComic.py
Pycharm运行结果
生成目录和下载图片
--结束END--
本文标题: Selenium&PhantomJS实战二:爬取漫画
本文链接: https://lsjlt.com/news/179165.html(转载时请注明来源链接)
有问题或投稿请发送至: 邮箱/279061341@qq.com QQ/279061341
2024-03-01
2024-03-01
2024-03-01
2024-02-29
2024-02-29
2024-02-29
2024-02-29
2024-02-29
2024-02-29
2024-02-29
回答
回答
回答
回答
回答
回答
回答
回答
回答
回答
0