Things change, roll with the punches. Oh, yeah. Go for it, man, jump off the high dive, stare down the barrel of the gun, pee into the wind!
Python下载百度新歌100的代码
上一篇 /
下一篇 2007-02-07 17:00:28
/ 个人分类:代码冢
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
3 # Copyright (c) 2006 UbuntuChina <http://www.ubuntu.org.cn>
4 # License: GPLv2
5 # Author: oneleaf <oneleaf AT gmail.com>
6 # hack by ct <ctqucl AT gmail.com>
7
8 import httplib
9 import re
10 import urllib
11 import os
12 import locale
# --- User-tunable settings -------------------------------------------------
fsize = 2    # lower bound on file size, in MB
m = '0'      # format: '-1'=any '0'=mp3 '1'=rm '2'=wma '3'=asf '4'=ram '5'=mid '6'=flash
topid = '1'  # chart selector; see _CHART_PATHS

# Chart selector -> page path requested by the script's main section.
_CHART_PATHS = {
    '0': '/list/newhits.html',         # new songs top 100
    '1': '/topso/mp3topsong.html',     # top 500
    '2': '/list/oldsong.html',         # classic oldies
    '3': '/list/movies.html',          # movie hits
    '4': '/list/tvs.html',             # TV songs
    '5': '/minge/mp3topsong.html',     # folk song selection
    '6': '/xiaoyuan/mp3topsong.html',  # campus songs
    '7': '/list/liujinsuiyue.html',    # "golden years" (new)
    '8': '/list/yaogun.html',          # rock
}

# An unrecognised selector is left untouched, exactly as the former
# if/elif chain (which had no else branch) left it.
topid = _CHART_PATHS.get(topid, topid)
37
38
def getdownfileurl(url):
    """Resolve a song-page link to the address of the audio file.

    ``url`` is the part of a link that follows ``http://220.181.27.54/m``.
    The link's ``&tn=...&word`` segment is rewritten to
    ``&tn=baidusg,mp3%20%20&word``, the page is fetched, and the first
    ``"_blank">...</a></a></li>`` anchor found in it is taken as the file
    address.

    Returns ``"http://" + urllib.quote(address)``.  Returns None when the
    link has no ``&tn=...&word`` segment, the page cannot be fetched, or no
    address is found in it.  These cases used to surface as an unrelated
    NameError or AttributeError.
    """
    url = "http://220.181.27.54/m" + url
    tn_segment = re.search('&tn=(.*)&word', url)
    if tn_segment is None:
        return None
    url = url.replace(tn_segment.group(0), '&tn=baidusg,mp3%20%20&word')

    try:
        fp = urllib.URLopener().open(url)
        try:
            data = fp.read()
        finally:
            fp.close()  # release the handle even if read() fails
    except IOError as errmsg:
        print(errmsg)
        return None

    anchor = re.search('"_blank">(.*)</a></a></li>', data)
    if anchor is None:
        return None
    # Drop the first 7 characters of the anchor text: presumably the
    # "http://" scheme, since it is re-added below.  TODO confirm.
    address = anchor.group(1)[7:]
    try:
        return "http://" + urllib.quote(address)
    except (KeyError, TypeError):
        # Best effort, as before: if quoting fails, hand back the raw text.
        return address
57
def getdownurl(url):
    """Collect download candidates from a song's result page.

    ``url`` is requested from ``mp3.baidu.com``.  Every
    ``http://220.181.27.54/m..." target`` link on the page is paired, by
    position, with the text captured from the ``<td>...M</td>`` cells
    (the file size as written on the page) and resolved through
    getdownfileurl().

    Returns a list of ``(download_url, size_text)`` tuples.  Links without
    a matching size cell, size cells without a link, and links that
    getdownfileurl() could not resolve are left out.  The previous
    ``map(None, ...)`` padded them with None, which callers cannot slice.
    """
    conn = httplib.HTTPConnection('mp3.baidu.com')
    try:
        conn.request("GET", url)
        html = conn.getresponse().read()
    finally:
        conn.close()  # do not leak the socket if the request fails

    links = re.findall('http://220.181.27.54/m(.*)" target', html)
    sizes = re.findall('<td>(.*)M</td>', html)

    candidates = []
    # zip() stops at the shorter list, so only complete pairs are resolved.
    for link, size in zip(links, sizes):
        address = getdownfileurl(link)
        if address:
            candidates.append((address, size))
    return candidates
77
def downmp3(url, author, name, filelist):
    """Download one song unless a file for it already exists.

    url          -- result-page path handed to getdownurl().
    author, name -- joined as ``author + "-" + name`` to form the file
                    name stem.
    filelist     -- existing file names; one that starts with the stem
                    means the song counts as already downloaded.

    Candidates from getdownurl() are tried in order.  The first one whose
    listed size is greater than the module-level ``fsize`` (MB) is fetched
    and written to ``stem + <last four characters of its URL>``.

    Returns 1 if the song was already present or has been downloaded,
    0 if no candidate could be downloaded.
    """
    stem = author + "-" + name
    encoding = locale.getpreferredencoding()
    for existing in filelist:
        if not isinstance(existing, unicode):
            # 'replace' keeps one undecodable directory entry from
            # aborting the whole run.
            existing = unicode(existing, encoding, 'replace')
        if existing.startswith(stem):
            print(u"文件已经下载,忽略。")
            return 1

    for address, size in getdownurl(url):
        if not address or size is None:
            continue  # incomplete candidate, nothing to try
        try:
            print(u"尝试 %s" % address)
            print(size + 'M')
            if float(size) > float(fsize):
                fp = urllib.URLopener().open(address)
                try:
                    data = fp.read()
                finally:
                    fp.close()
                # Build the target name afresh for every candidate so a
                # failed attempt cannot leave a stale extension on the stem.
                with open(stem + address[-4:], 'w+b') as out:
                    out.write(data)
                print(u"下载成功!")
                return 1
            elif float(size) < float(fsize):
                print(u"文件太小,忽略!")
        except Exception as err:
            # Still best-effort per candidate, but Ctrl-C is no longer
            # swallowed and the reason for the skip is shown.
            print(err)
            continue
    return 0
109
def _parse_song(link):
    """Split one chart link fragment into (result-page path, song title).

    Returns None when the fragment does not have the expected
    ``...target...blank>title`` shape.
    """
    link = link.replace('m=-1', 'm=' + m)  # select the requested format
    head = re.search('(.*)target', link)
    title = re.search('blank>(.*)', link)
    if head is None or title is None:
        return None
    # [:-2] removes the two characters just before "target"
    # (presumably the closing quote and a space -- TODO confirm).
    return '/m' + head.group(1)[:-2], title.group(1)


def _parse_author(cell):
    """Extract the performer text from one chart cell; None if absent.

    Two performers separated by ``</A>/<A`` are joined with '+'.
    """
    if re.search('</A>/<A href=(.*) target=_blank>', cell) is not None:
        # Two performers.
        first = re.search('>(.*)</A>/<A', cell)
        tail = re.search('/<A href=(.*)</A>', cell)
        if first is None or tail is None:
            return None
        second = re.search('>(.*)<', tail.group(0))
        if second is None:
            return None
        return first.group(1) + '+' + second.group(1)
    # Single performer.
    solo = re.search('blank>(.*)</', cell)
    if solo is None:
        return None
    return solo.group(1)


def main():
    """Fetch the chart page selected by ``topid`` and download each entry."""
    conn = httplib.HTTPConnection('list.mp3.baidu.com')
    try:
        conn.request("GET", topid)  # chart page
        # 'replace' keeps a single malformed byte from aborting the run.
        html = conn.getresponse().read().decode('gbk', 'replace')
    finally:
        conn.close()

    numbers = re.findall('border">(.*).</td>', html)                   # rank
    links = re.findall('><a href="http://mp3.baidu.com/m(.*)</a>', html)  # title
    cells = re.findall('href="http://mp3.baidu.com/m(.*)</td>', html)     # performer

    # zip() stops at the shortest list, so a page where the three patterns
    # match a different number of rows no longer ends in an IndexError.
    for idno, link, cell in zip(numbers, links, cells):
        song = _parse_song(link)
        author = _parse_author(cell)
        if song is None or author is None:
            # Unparseable row: report it and carry on with the next one.
            print(u" ".join([u"下载", idno, u"失败!"]))
            continue
        url, name = song
        print(u" ".join([u"开始下载", idno, name, author]))
        # Re-list on every pass so files saved earlier in this run are seen.
        filelist = os.listdir('.')
        if downmp3(url, author, name, filelist) == 0:
            print(u" ".join([u"下载", author, name, u'失败!']))


if __name__ == "__main__":
    main()
146
转自:http://forum.ubuntu.org.cn/viewtopic.php?t=15682
收藏
举报
TAG:
Python
代码冢