When a crawler has to download many files linked from one page (http://blog.sina.com.cn/s/blog_740773f40100ywyg.html), multithreading speeds the downloads up considerably, and a thread pool is a simple way to manage those threads. The threadpool library's basic usage pattern is:
from threadpool import *
pool = ThreadPool(poolsize)
requests = makeRequests(some_callable, list_of_args, callback)
[pool.putRequest(req) for req in requests]
pool.wait()
Internally, each worker thread executes a queued request as:

result = request.callable(*request.args, **request.kwds)
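How makeRequests builds those requests determines how the callable is invoked. Judging from the threadpool library's source, an args_list entry that is a 2-tuple is unpacked into (args, kwds), while any other entry is wrapped as a single positional argument. A minimal sketch of the three forms (the greet function is hypothetical, purely for illustration):

import threadpool

# Hypothetical callable, used only to show how arguments arrive.
def greet(name, greeting='hello'):
    print '%s, %s!' % (greeting, name)

pool = threadpool.ThreadPool(2)

# Form 1: each entry is passed as one positional argument.
reqs = threadpool.makeRequests(greet, ['alice', 'bob'])
# Form 2: an (args, kwds) tuple gives positionals plus keywords.
reqs += threadpool.makeRequests(greet, [(['carol'], {'greeting': 'hi'})])
# Form 3: keywords only; the positional part may be None.
reqs += threadpool.makeRequests(greet, [(None, {'name': 'dave'})])

for req in reqs:
    pool.putRequest(req)
pool.wait()

The second and third forms are exactly what the later versions below exploit.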
Applying this to the PDF links on the blog page above, a first complete version derives each file name inside the worker:

import cookielib
import urllib2
import socket
import os
from bs4 import BeautifulSoup
import threadpool
import threading
def download(pdfUrl):
    folder = 'matlab_pdf'
    # Serialize the existence check so two threads don't race on makedirs.
    mutex.acquire()
    if not os.path.exists(folder):
        os.makedirs(folder)
    mutex.release()
    name = pdfUrl.split('/')[-1]
    status = False
    f = None
    try:
        f = open(os.path.join(folder, name), 'wb')
        f.write(urllib2.urlopen(pdfUrl).read())
        status = True
    except Exception as err:
        print err
    finally:
        if f is not None:
            f.close()
    return (name, status)
def print_result(request, result):
    # Callback the pool invokes with each request's return value.
    print "%s %s" % (result[0], 'was downloaded.' if result[1] else 'could not be downloaded.')
initUrl = r"http://blog.sina.com.cn/s/blog_740773f40100ywyg.html"
socket.setdefaulttimeout(10)
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
opener.addheaders = [('User-agent', 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)')]
urllib2.install_opener(opener)
resp = urllib2.urlopen(initUrl).read()
a_list = BeautifulSoup(resp, 'html.parser').find_all('a')
urls = [a.get('href') for a in a_list]
pdfUrls = [url for url in urls if url and url.endswith('pdf')]
mutex = threading.Lock()
pool = threadpool.ThreadPool(30)
requests = threadpool.makeRequests(download, pdfUrls, print_result)
for req in requests:
    pool.putRequest(req)
pool.wait()
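Since download relies only on the module-level mutex, it can also be sanity-checked directly, without the pool (the URL here is made up for illustration):

# Direct call for testing, bypassing the pool; hypothetical URL.
print download('http://example.com/docs/guide.pdf')
# -> ('guide.pdf', True) on success, ('guide.pdf', False) on failure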
A second version passes two arguments to download, the file name and the URL, instead of deriving the name inside the worker:

import cookielib
import urllib2
import socket
import os
from bs4 import BeautifulSoup
import threadpool
import threading
def download(name, pdfUrl):
    folder = 'matlab_pdf_test'
    # Serialize the existence check so two threads don't race on makedirs.
    mutex.acquire()
    if not os.path.exists(folder):
        os.makedirs(folder)
    mutex.release()
    # name now arrives as an argument instead of being derived here.
    status = False
    f = None
    try:
        f = open(os.path.join(folder, name), 'wb')
        f.write(urllib2.urlopen(pdfUrl).read())
        status = True
    except Exception as err:
        print err
    finally:
        if f is not None:
            f.close()
    return (name, status)
def print_result(request, result):
    print "%s %s" % (result[0], 'was downloaded.' if result[1] else 'could not be downloaded.')
initUrl = r"http://blog.sina.com.cn/s/blog_740773f40100ywyg.html"
socket.setdefaulttimeout(10)
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
opener.addheaders = [('User-agent', 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)')]
urllib2.install_opener(opener)
resp = urllib2.urlopen(initUrl).read()
a_list = BeautifulSoup(resp, 'html.parser').find_all('a')
urls = [a.get('href') for a in a_list]
pdfUrls = [url for url in urls if url and url.endswith('pdf')]
# Each request is an (args, kwds) tuple: positional [name, url], no keywords.
v = [[url.split('/')[-1], url] for url in pdfUrls]
u = [None] * len(pdfUrls)
mutex = threading.Lock()
pool = threadpool.ThreadPool(30)
requests = threadpool.makeRequests(download, zip(v, u), print_result)
for req in requests:
    pool.putRequest(req)
pool.wait()
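To make the calling convention concrete, this is what a single element of zip(v, u) looks like (the URL is made up for illustration):

# Illustration only: one element of zip(v, u).
pdfUrls = ['http://example.com/docs/guide.pdf']   # hypothetical URL
v = [[url.split('/')[-1], url] for url in pdfUrls]
u = [None] * len(pdfUrls)
print zip(v, u)[0]
# -> (['guide.pdf', 'http://example.com/docs/guide.pdf'], None)
# i.e. args=['guide.pdf', url] and kwds=None, so each worker calls
# download('guide.pdf', url) positionally.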
A third version keeps the two-argument download but supplies the arguments as keywords, packing each request into a dict:

import cookielib
import urllib2
import socket
import os
from bs4 import BeautifulSoup
import threadpool
import threading
def download(name, pdfUrl):
    folder = 'matlab_pdf_test'
    # Serialize the existence check so two threads don't race on makedirs.
    mutex.acquire()
    if not os.path.exists(folder):
        os.makedirs(folder)
    mutex.release()
    status = False
    f = None
    try:
        f = open(os.path.join(folder, name), 'wb')
        f.write(urllib2.urlopen(pdfUrl).read())
        status = True
    except Exception as err:
        print err
    finally:
        if f is not None:
            f.close()
    return (name, status)
def print_result(request, result):
    print "%s %s" % (result[0], 'was downloaded.' if result[1] else 'could not be downloaded.')
initUrl = r"http://blog.sina.com.cn/s/blog_740773f40100ywyg.html"
socket.setdefaulttimeout(10)
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
opener.addheaders = [('User-agent', 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)')]
urllib2.install_opener(opener)
resp = urllib2.urlopen(initUrl).read()
a_list = BeautifulSoup(resp, 'html.parser').find_all('a')
urls = [a.get('href') for a in a_list]
pdfUrls = [url for url in urls if url and url.endswith('pdf')]
# Each request is now (None, kwds): no positionals, keywords from a dict,
# so the pool calls download(name=..., pdfUrl=...).
v = [{'name': url.split('/')[-1], 'pdfUrl': url} for url in pdfUrls]
u = [None] * len(pdfUrls)
mutex = threading.Lock()
pool = threadpool.ThreadPool(30)
requests = threadpool.makeRequests(download, zip(u, v), print_result)
for req in requests:
    pool.putRequest(req)
pool.wait()
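The threadpool module itself is old and no longer maintained. For comparison only, a minimal Python 3 sketch of the same fan-out pattern using the standard library's concurrent.futures (same assumptions as above: a pdfUrls list and a target folder):

# Python 3 sketch; not part of the original Python 2 scripts above.
import os
import urllib.request
from concurrent.futures import ThreadPoolExecutor, as_completed

def download(pdf_url, folder='matlab_pdf'):
    name = pdf_url.split('/')[-1]
    try:
        data = urllib.request.urlopen(pdf_url, timeout=10).read()
        with open(os.path.join(folder, name), 'wb') as f:
            f.write(data)
        return name, True
    except Exception as err:
        print(err)
        return name, False

os.makedirs('matlab_pdf', exist_ok=True)  # created once up front, so no lock
pdfUrls = []  # collect as in the scraping code above
with ThreadPoolExecutor(max_workers=30) as pool:
    futures = [pool.submit(download, url) for url in pdfUrls]
    for fut in as_completed(futures):
        name, ok = fut.result()
        print('%s %s' % (name, 'was downloaded.' if ok else 'could not be downloaded.'))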