1. Basics of the requests library
Methods of the requests library
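The requests library exposes one top-level function per HTTP verb. A minimal sketch of the most common ones (httpbin.org is used here only as a generic test endpoint, not part of the scraper below):

import requests

r = requests.get('https://httpbin.org/get')                            # fetch a resource
r = requests.post('https://httpbin.org/post', data={'key': 'value'})   # submit form data
r = requests.head('https://httpbin.org/get')                           # headers only, no body
r = requests.put('https://httpbin.org/put', data={'key': 'value'})     # replace a resource
r = requests.delete('https://httpbin.org/delete')                      # delete a resource

Each call returns a response object, described next.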
The response object of the requests library
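A short sketch of the response attributes that matter here; the scraper below relies only on r.text:

import requests

r = requests.get('https://httpbin.org/get')
print(r.status_code)   # HTTP status code, e.g. 200
print(r.encoding)      # encoding inferred from the response headers
print(r.headers)       # response headers, a dict-like object
html_str = r.text      # body decoded to str (what the scraper uses)
raw = r.content        # body as raw bytes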
2. Scraping the required information from the site
Visit the site, as shown in Figure 1-1:
Figure 1-1
Click into a subpage and inspect the page elements; part of the content is shown in Figure 1-2:
Figure 1-2
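To make the selectors in the code below concrete, here is a self-contained sketch that parses a simplified, hypothetical fragment modeled on the inspected markup (the real page has more attributes and nesting):

from bs4 import BeautifulSoup

# Hypothetical fragment: each category on the index page sits in an
# <li class="medium listbox group"> with an <h4 class="heading"> link.
html = '''
<li class="medium listbox group">
  <h4 class="heading"><a href="/media/Anime/fandoms">Anime</a></h4>
</li>
'''
data = BeautifulSoup(html, "html.parser")
for li in data.find_all('li', attrs={'class': "medium listbox group"}):
    a = li.find('h4', attrs={'class': "heading"}).find('a')
    print(a.text, a['href'])   # prints: Anime /media/Anime/fandoms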
The implementation code is as follows:
# coding: utf-8
import requests
from bs4 import BeautifulSoup
import xlsxwriter

# Fetch the raw HTML of a page.
def GET_HTML_CONTENT(url):
    # Set a User-Agent so the request looks like it comes from a browser.
    user_agent = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36')
    headers = {'User-Agent': user_agent}
    r = requests.get(url, headers=headers)
    # Return the decoded page content.
    html_str = r.text
    return html_str

# Extract the category names and subpage URLs from the media index page.
def GET_CHILD_URL(content):
    data = BeautifulSoup(content, "html.parser")
    genre_session = data.find_all('li', attrs={'class': "medium listbox group"})
    # childurl collects one dict per category: its name and subpage URL.
    childurl = []
    for session in genre_session:
        elements = session.find_all('h4', attrs={'class': "heading"})
        for element in elements:
            genre = {}
            genre['name'] = element.find('a').text
            genre['nextpage'] = element.find('a')['href']
            childurl.append(genre)
    return childurl

# Parse one category subpage: collect each work's category, name and review count.
def GET_CHILD_INFO(content, kind):
    data = BeautifulSoup(content, "html.parser")
    book_session = data.find_all('ol', attrs={'class': "alphabet fandom index group "})
    items = book_session[0].find_all('ul', attrs={'class': "tags index group"})
    # books collects one dict per work: its category, name and review count.
    books = []
    for item in items:
        book = {}
        book['kinds'] = kind
        book['name'] = item.find('a').text
        # The review count is the trailing "(N)" after the work's name.
        book['reviews'] = item.text.strip().split('\n')[-1].strip().strip('()')
        books.append(book)
    return books

if __name__ == '__main__':
    url = 'https://archiveofourown.org/media'
    content = GET_HTML_CONTENT(url)
    childurl = GET_CHILD_URL(content)
    row = 1
    col = 0
    data = [['Category', 'Name', 'Reviews']]
    workbook = xlsxwriter.Workbook("data.xlsx")
    worksheet = workbook.add_worksheet()
    worksheet.write_row(0, 0, data[0])
    for k in childurl:
        kind = k['name']
        nexturl = k['nextpage']
        geturl = 'https://archiveofourown.org' + nexturl
        txt = GET_HTML_CONTENT(geturl)
        books = GET_CHILD_INFO(txt, kind)
        for info in books:
            worksheet.write(row, col, info['kinds'])
            worksheet.write(row, col + 1, info['name'])
            worksheet.write(row, col + 2, info['reviews'])
            row += 1
    workbook.close()
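Note that xlsxwriter only flushes data to disk when workbook.close() is called, so the script must run to completion before data.xlsx is usable. To spot-check the output, a small sketch using openpyxl (an assumed extra dependency, not used by the scraper itself):

from openpyxl import load_workbook

wb = load_workbook("data.xlsx")
ws = wb.active
for r in ws.iter_rows(max_row=5, values_only=True):
    print(r)   # header row, then (category, name, reviews) tuples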
The result of running the script is shown in Figure 1-3:
Figure 1-3