以下是通过爬取预约挂号医院网站的两种方式:
1、reWithhospital.py
#coding=utf-8
import urllib
import urllib2
import re
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
f = open("D:\Spider.txt","w")
type = sys.getfilesystemencoding()
print type
page=1
for page in range(1,58):
url='http://yyk.39.net/shenzhen/hospitals/c_p'+str(page)+'/'
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5;
Windows NT)'
headers = { 'User-Agent' : user_agent }
try:
request=urllib2.Request(url,headers = headers)
response=urllib2.urlopen(request)
content = response.read()
pattern=re.compile('<a href="/sz/zonghe/.*?.html" class="yy-name" title=".*?">(.*?)</a>.*?</div>.*?<p>(.*?)</p>.*?<p class="di">.*?<a href="/sz/zonghe/.*?.html" title=".*?">(.*?)</a>.*?</p>',re.S)
items=re.findall(pattern,content)
for item in items:
f.write(item[0]+","+item[1].strip()+','+item[2]+"\n")
except urllib2.URLError,e:
if hasattr(e, "code"):
print e.code
if hasattr(e, "reason"):
print e.reason
f.close()
2、bs4WithHospital.py
#http://python.jobbole.com/81349/ 大多数内容参考该网站
#coding=utf-8
import urllib
import re
import urllib2
import bs4
from bs4 import BeautifulSoup
import json
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
page=1
for page in range(1,2):
url='http://yyk.39.net/shenzhen/hospitals/c_p'+str(page)+'/'
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = { 'User-Agent' : user_agent }
try:
request=urllib2.Request(url,headers = headers)
response=urllib2.urlopen(request)
content = response.read()
soup=BeautifulSoup(content)
pattern=re.compile('<p class="di"><a href=".*?>(.*?)</a></p>',re.S)
#items=re.findall(pattern,content)
#print soup.prettify()
# for yyname in soup.find_all('a',class_='yy-name'):
# print yyname
L= soup.find_all('p',class_='di')
print type(list(L))
LL= [re.findall(pattern, str(ss))[0] for ss in L]
res_data=json.dumps(LL,ensure_ascii=False,encoding="utf-8")
print res_data
except urllib2.URLError,e:
if hasattr(e, "code"):
print e.code
if hasattr(e, "reason"):
print e.reason
-----文件2是为了试验列表生成式,并解决打印到控制台时中文乱码的问题(通过引入 json.dumps 并设置 ensure_ascii=False,可以把中文正常打印出来)。试了许久,值得纪念一下,方便以后自己查看。
原理以后再研究研究
-----网络爬虫和bs4引用,来自网站:http://python.jobbole.com/81349/--作者写的特别好,推荐