Scraping Lianjia residential community data with Python

2017-12-30 13:24:08 / Category: python

from bs4 import BeautifulSoup
import requests, csv, threadpool  # threadpool is a third-party package: pip install threadpool

host = 'http://sh.lianjia.com'
infos = []  # scraped community records
links = []  # detail-page links collected from the list pages
# Thread pool with 100 worker threads
pool = threadpool.ThreadPool(100)


# Fetch one detail page and extract the community's fields
def open_detail(page):
    url = host + page
    global infos
    try:
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        # Title
        title = soup.select_one('h1').text
        # Address
        adr = soup.select_one('.adr').attrs['title']
        # District
        q = soup.select_one('.t > span').text
        # div holding the detailed info
        res_info = soup.select_one('.res-info')
        # Price (some communities have no listed average price)
        price_span = res_info.select_one('.priceInfo').select_one('.p')
        price = 'no listed average price'
        if price_span is not None:
            price = price_span.text.replace('\n', '').strip()
        # Year built
        year = res_info.select('.col-2 > ol > li')[1].select_one('.other').text.replace('\n', '').strip()

        infos.append({
            'title': title,
            'adr': adr,
            'q': q,
            'price': price,
            'year': year
        })
    except Exception as ex:
        print('%s failed' % url)
        print(ex)
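Note that requests.get is called with no timeout, so one stalled response can tie up a worker thread indefinitely. A minimal hardening sketch using requests' Session plus urllib3's Retry (the timeout and retry numbers here are my assumptions, not from the original post):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Shared session that retries transient server errors with backoff
session = requests.Session()
retry = Retry(total=3, backoff_factor=0.5, status_forcelist=[500, 502, 503, 504])
session.mount('http://', HTTPAdapter(max_retries=retry))

# open_detail / page_link would then fetch with:
#     r = session.get(url, timeout=10)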


# Fetch one list page and collect the detail-page links on it
def page_link(i):
    url = 'http://sh.lianjia.com/xiaoqu/d%d' % i
    global links
    try:
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        links += [link.attrs['href'] for link in soup.select('.info-panel > h2 > a')]
    except Exception as ex:
        print('%s failed' % url)
        print(ex)

    print('page %d done' % i)
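One caveat: page_link and open_detail both mutate shared lists (links, infos) from up to 100 threads at once. In CPython the GIL makes a single list append or += effectively atomic, so this happens to work, but that is an implementation detail rather than a guarantee. A minimal sketch of making it explicit with a lock (the helper name is mine, not part of the original script):

import threading

links_lock = threading.Lock()

def add_links(new_links):
    # Serialize updates to the shared links list across worker threads
    with links_lock:
        links.extend(new_links)

page_link would then call add_links(...) instead of doing links += ... directly.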


# Phase 1: open the list pages (1-1379) in parallel and collect the detail-page links
for req in threadpool.makeRequests(page_link, range(1, 1380)):
    pool.putRequest(req)
pool.wait()
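If the same community shows up on more than one list page (plausible, though I have not verified it for sh.lianjia.com), links now holds duplicates and phase 2 would fetch those pages twice. An optional de-duplication step, not in the original script:

links = list(set(links))  # drop duplicate detail links; order is not preserved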

# Phase 2: open every collected detail page in parallel
for req in threadpool.makeRequests(open_detail, links):
    pool.putRequest(req)
pool.wait()

# Write the results to CSV
headers = ['community', 'address', 'district', 'price (yuan/m²)', 'year built']
# utf-8-sig so the Chinese text opens correctly in Excel
with open('info.csv', 'w', newline='', encoding='utf-8-sig') as f:
    writer = csv.writer(f)
    writer.writerow(headers)

    for info in infos:
        writer.writerow([info['title'], info['adr'], info['q'], info['price'], info['year']])
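As an aside, the third-party threadpool package is long unmaintained; the standard library's concurrent.futures gives the same fan-out with less ceremony. A sketch of the two phases above rewritten with it (same page_link / open_detail functions, stdlib only):

from concurrent.futures import ThreadPoolExecutor

# Phase 1: collect detail-page links from the list pages
with ThreadPoolExecutor(max_workers=100) as ex:
    list(ex.map(page_link, range(1, 1380)))

# Phase 2: scrape every detail page
with ThreadPoolExecutor(max_workers=100) as ex:
    list(ex.map(open_detail, links))

Exiting each with block waits for all submitted tasks to finish, playing the role of pool.wait().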

TAG: Python, web scraping, Lianjia, Shanghai residential communities

 
