空间管理您的位置: 51Testing软件测试网 » Bryan's doodle-LinuxEndLess » 日志

Things change, roll with the punches. Oh, yeah. Go for it man, jump off the high dive, stare down the barrel of the gun, pee into the wind!

Python下载百度新歌100的代码

上一篇 / 下一篇 2007-02-07 17:00:28 / 个人分类：代码冢

查看( 2462 ) / 评论( 3 ) / 评分( 0 / 0 )

#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (c) 2006 UbuntuChina <http://www.ubuntu.org.cn>
# License: GPLv2
# Author: oneleaf <oneleaf AT gmail.com>
# hack by ct <ctqucl AT gmail.com>

import httplib
import locale
import os
import re
import urllib
 13 global m,topid
 14 global fsize
 15 fsize=2     #文件大小下限(M) 
 16 m='0'       #'-1'=任意  '0'=mp3 '1'=rm '2'=wma '3'=asf '4'=ram '5'=mid '6'=flash 
 17 topid='1'
 18 
 19 if topid=='0':
 20      topid='/list/newhits.html'         #新歌100 
 21 elif topid=='1':
 22      topid='/topso/mp3topsong.html'     #Top500 
 23 elif topid=='2':
 24      topid='/list/oldsong.html'         #老歌经典 
 25 elif topid=='3':
 26      topid='/list/movies.html'          #电影金曲 
 27 elif topid=='4':
 28      topid='/list/tvs.html'             #电视歌曲 
 29 elif topid=='5':
 30      topid='/minge/mp3topsong.html'     #民歌精选 
 31 elif topid=='6':
 32      topid='/xiaoyuan/mp3topsong.html'  #校园歌曲 
 33 elif topid=='7':
 34      topid='/list/liujinsuiyue.html'    #流金岁月(new) 
 35 elif topid=='8':
 36      topid='/list/yaogun.html'          #摇滚地带 
 37 
 38 
 39 def getdownfileurl(url):                #获取歌曲页的试听URL 
 40     url = "http://220.181.27.54/m"+url
 41     tn = re.search('&tn=(.*)&word',url).group(0)
 42     url=url.replace(tn,'&tn=baidusg,mp3%20%20&word')
 43     try:
 44         urlopen = urllib.URLopener()
 45         fp=urlopen.open(url)
 46         data = fp.read()
 47         fp.close()
 48     except IOError, errmsg:
 49         print errmsg
 50     expression2='"_blank">(.*)</a></a></li>'
 51     url = re.search(expression2, data).group(0)[16:-13]
 52     try:
 53         url="http://"+urllib.quote(url)
 54     except:pass
 55     #print u"发现 "+url 
 56     return url
 57 
 58 def getdownurl(url):                     #从歌曲页抓取URL列表 
 59     urllist=[]
 60     urllist1=[]
 61     urllist2=[]
 62     conn = httplib.HTTPConnection('mp3.baidu.com')
 63     conn.request("GET",url)
 64     response = conn.getresponse()
 65     html=response.read()
 66     conn.close()
 67     expression2='http://220.181.27.54/m(.*)" target'
 68     listSentence2 = re.findall(expression2, html)     #抓取链接列表 
 69     filesize=re.findall('<td>(.*)M</td>',html)        #抓取文件大小 
 70     lineno=0
 71     while lineno<len(listSentence2):
 72         mp3url=getdownfileurl(listSentence2[lineno])  #转换链接为最终下载地址 
 73         urllist1.append(mp3url)
 74         lineno+=1
 75     urllist=map(None,urllist1,filesize)
 76     return urllist
 77 
 78 def downmp3(url,author,name,filelist):    #下载歌曲 
 79     filename=author+"-"+name;
 80     for i in filelist:
 81         name=unicode(i,locale.getpreferredencoding())
 82         if name.find(filename) == 0:      #忽略 
 83             print u"文件已经下载，忽略。"
 84             return 1
 85     urllists=getdownurl(url)              #获取文件url列表 
 86     lineno=0
 87     while lineno<len(urllists):
 88         print u"尝试",urllists[lineno][0]
 89         ext=urllists[lineno][0][-4:]      #获取文件名后缀(最后4位) 
 90         try:
 91             lineno+=1
 92             print urllists[lineno-1][1] +'M'
 93             if float(urllists[lineno-1][1])>float(fsize) :   #大小符合则下载 
 94                   urlopen = urllib.URLopener()
 95                   fp=urlopen.open(urllists[lineno-1][0])
 96                   data = fp.read()
 97                   fp.close()
 98                   filename=filename+ext;
 99                   file=open(filename,'w+b')
100                   file.write(data)
101                   file.close()
102                   print u"下载成功!"
103                   return 1
104             elif float(urllists[lineno][1])<float(fsize) :   #不符则略过 
105                   print u"文件太小,忽略!"
106         except:
107             continue
108     return 0
109 
def _extract_author(cell):
    """Return the performer name(s) found in one artist table cell.

    A cell containing two artist links joined by ``</A>/<A`` is treated as a
    duet and returned as ``first+second``; otherwise the text of the single
    link is returned.  Returns None when the expected markup is not found.
    """
    if re.search('</A>/<A  href=(.*) target=_blank>', cell) is not None:   # duet
        first = re.search('>(.*)</A>/<A', cell)
        second = re.search('/<A  href=(.*)</A>', cell)
        if second is not None:
            second = re.search('>(.*)<', second.group(0))
        if first is None or second is None:
            return None
        return first.group(1) + '+' + second.group(1)
    solo = re.search('blank>(.*)</', cell)                                  # solo
    if solo is None:
        return None
    return solo.group(1)


def main():
    """Fetch the chart page selected by ``topid`` and download every song.

    The page is requested from ``list.mp3.baidu.com`` and decoded as GBK.
    Entry numbers, song links and artist cells are extracted with three
    regular expressions and processed in lockstep; each song is handed to
    ``downmp3`` together with a fresh listing of the current directory.
    """
    conn = httplib.HTTPConnection('list.mp3.baidu.com')
    try:
        conn.request("GET", topid)          # chart selected in the settings
        html = conn.getresponse().read().decode('gbk')
    finally:
        conn.close()

    numbers = re.findall('border">(.*).</td>', html)                        # entry numbers
    songs = re.findall('><a href="http://mp3.baidu.com/m(.*)</a>', html)    # song links
    artists = re.findall('href="http://mp3.baidu.com/m(.*)</td>', html)     # artist cells

    # zip() stops at the shortest list, so a page that yields fewer songs or
    # artists than entry numbers no longer causes an IndexError.
    for idno, song, cell in zip(numbers, songs, artists):
        song = song.replace('m=-1', 'm=' + m)   # request the configured format
        link = re.search('(.*)target', song)
        title = re.search('blank>(.*)', song)
        author = _extract_author(cell)
        if link is None or title is None or author is None:
            print(u"无法解析第 %s 项，跳过。" % (idno,))
            continue
        # Drop the last 8 characters of the match: "target" plus the two
        # characters in front of it (presumably a closing quote and a space).
        url = '/m' + link.group(0)[:-8]
        name = title.group(1)
        print(u"开始下载 %s %s %s" % (idno, name, author))
        # List the directory on every pass so files saved earlier are seen.
        if downmp3(url, author, name, os.listdir('.')) == 0:   # download failed
            print(u"下载 %s %s 失败！" % (author, name))


if __name__ == "__main__":
    main()

转自：http://forum.ubuntu.org.cn/viewtopic.php?t=15682