免费网页数据抓取采集 python实现一个多线程网页下载器
上一篇 / 下一篇 2011-10-23 14:52:56 / 个人分类:python
文章来源
- 文章来源:【转载】
- #!/usr/bin/env python
- # -*- coding:utf-8 -*-
u}2V![O}Sf0 - import urllib, httplib
A3u1^]?N-Yl5d9U0 - import thread
(S#aG#Dd6w0 - import time
qiW*[KZ0 - from Queue import Queue, Empty, Full 51Testing软件测试网sqdeXnS^
# Default HTTP request headers used as the base for each request.
# NOTE(review): get_remote_data binds this dict directly (no copy) and then
# calls .update() / sets 'Content-Length' on it, so per-task headers end up
# in this shared module-level dict. Copy it before mutating if that is not
# intended.
HEADERS = {"Content-type": "application/x-www-form-urlencoded",
           'Accept-Language': 'zh-cn',
           'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0;Windows NT 5.0)',
           "Accept": "text/plain"}

# Sentinel status value; presumably reported when a request fails without an
# HTTP status code -- its use is not visible here, confirm against callers.
UNEXPECTED_ERROR = -1

# HTTP method names compared against a task's 'method' entry.
POST = 'POST'
GET = 'GET'

def base_log(msg):
    """Default logger: write *msg* to standard output.

    Args:
        msg: The value to print (normally an already formatted string).
    """
    # The parenthesised single-argument form prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print(msg)

def base_fail_op(task, status, log):
    """Default failure handler: report a failed task through *log*.

    Args:
        task: The task that failed; rendered with ``str()``.
        status: Numeric status code; formatted with ``%d``, so a
            non-numeric value raises ``TypeError``.
        log: Callable taking a single message string.
    """
    log('fail op. task = %s, status = %d' % (str(task), status))

- def get_remote_data(tasks, results, fail_op = base_fail_op, log = base_log): 51Testing软件测试网(jI@fC#pU;TU~8^Y
- while True:
&BZy3e!|k NX(BFv0 - task = tasks.get()
.sI'h*FR7T:^9`E0 - try:
#b4n]'g9Y(G _9M:L_sp0 - tid = task['id']
O svxlF0 - hpt = task['conn_args'] # hpt <= host:port, timeout 51Testing软件测试网2W-y;q
^4k7U$Q
- except KeyError, e: 51Testing软件测试网$ObsCT h
- log(str(e))
w(qk K3\X$iI0 - continue
*G TXaN+K~C0 - log('thread_%s doing task %d'%(thread.get_ident(), tid)) 51Testing软件测试网ac ^B ](l(ytb
- #log('hpt = ' + str(hpt))
j qp fx-g;r(ze0 - conn = httplib.HTTPConnection(**hpt)
:E f%o%z&p;Bv0 -
xyX&U1M.F Q.C0 - try: 51Testing软件测试网4E*ut*VyT6Mj
- params = task['params']
6Z:DQ/~/d9s,Q[0 - except KeyError, e:
g'g&L? _ B#u0 - params = {}
h-h7Q`^-W1wDy y0 - params = urllib.urlencode(params) 51Testing软件测试网Cdd4tc
- #log('params = ' + params)
F)?O,T Ny0 -
6a9H'u)k\~Kk*A&a0 - try:
;{2qOJ ?#}_0 - method = task['method'] 51Testing软件测试网K1j-lfr[
?
- except KeyError: 51Testing软件测试网]LIo.S
z!V(p
- method = 'GET'
rp\-H/d r0 - #log('method = ' + method)
9w? F\S/i I0nt0 -
J!Z1c*m2Q a&}:KX0 - try:
;|2a,u)P7u+|0 - url = task['url'] 51Testing软件测试网~-|Q/rwF-a$v
- except KeyError:
4^^2I*Jy8s$P/w0 - url = '/' 51Testing软件测试网V~ I"WvO.y
- #log('url = ' + url) 51Testing软件测试网,U0S'o[|rP,~!wH
- 51Testing软件测试网 h{^
L.U:F p8h2j
- headers = HEADERS 51Testing软件测试网r"W@J%z1y~n)Ym
- try: 51Testing软件测试网+|V\A PbW9X
- tmp = task['headers']
0w$eDIVV0 - except KeyError, e:
r4d*y~_3Gt_NO0 - tmp = {} 51Testing软件测试网X0_U,W(e*CJ
- headers.update(tmp) 51Testing软件测试网_+]!oZ.s,}.IV
- #log('headers = ' + str(headers))
#jt&k\4m2YSD8Ey0 - headers['Content-Length'] = len(params) 51Testing软件测试网'ujsn y
- 51Testing软件测试网"G!p w3yl w?3g1s
Y
- try: 51Testing软件测试网+K^8G9U+LlX
- if method == POST:
SO~+_%P0 - conn.request(method, url, params, headers) 51Testing软件测试网cVm%S
gX
- else: