最近在做淘米网的视频死链判断,但是淘米网的大部分视频都是调用乐视、优酷、土豆等视频网站的接口。所以要判断死链,必须先抓到对应视频源网站的URL。
比如,淘米网的视频地址为http://play.v.61.com/comic-play/3145/24.shtml,
我们希望能够自动找到对应的源地址,程序最终得到的是http://www.letv.com/ptv/pplay/37223/24.html(乐视的地址)。
程序设计思路如下:
本人对C++的网络编程以及抓包程序设计不熟悉,而且没必要舍近求远,所以结合watir、ruby以及httpwatch来实现。
require 'net/http'
require 'watir'
require 'win32ole'
require 'watir'
$KCODE='e'
require 'timeout'
# Starts an Internet Explorer instance through Watir and attaches an HttpWatch
# recorder to it.
#
# On success @ie holds the Watir::IE browser and @plugin holds the HttpWatch
# plugin object returned by Attach; the plugin's log is cleared and its filter
# is disabled.
#
# Creating the COM controller / browser occasionally fails when this is done
# once per video link, so a failed attempt is reported and repeated exactly
# once. A second failure is propagated to the caller.
def setup
  attempts = 0
  begin
    attempts += 1
    WIN32OLE.codepage = WIN32OLE::CP_UTF8
    # New HttpWatch controller object, obtained through WIN32OLE.
    control = WIN32OLE.new('HttpWatch.Controller')
    @ie = Watir::IE.new
    @ie.speed = :fast
    #@ie.visible=0
    @ie.logger.level = Logger::ERROR
  rescue StandardError => e
    # Only one retry: a repeated failure is not swallowed.
    raise if attempts > 1
    puts "ie create error!" + e.message
    retry
  end

  # Attach HttpWatch to the IE instance that Watir controls.
  @plugin = control.ie.Attach(@ie.ie)
  # Drop anything recorded earlier before the first capture starts.
  @plugin.Clear()
  @plugin.Log.EnableFilter(false)
end
# Opens a page in the Watir-driven browser while HttpWatch records the traffic,
# then derives the upstream video page address from the recorded entries.
#
# Expects @ie and @plugin to be populated already (setup does that).
#
# url          - address passed to @ie.goto.
# wait_seconds - how long recording continues after goto returns, in seconds.
#                Defaults to 4, the delay that used to be hard-coded.
#
# Returns one of two arrays (also stored in @youkuid / @value):
# * [vid, "http://v.youku.com/v_show/id_<vid>.html"] as soon as a recorded
#   response body contains "var from_type = 'youku'";
# * otherwise [pid, epid, "http://www.letv.com/ptv/pplay/<pid>/<epid>.html"],
#   where pid / epid come from "&pid=" / "&epid=" query parameters of the
#   recorded request URLs. Either one is nil when no such parameter was seen,
#   so the third element is always present and callers have to check the first
#   two themselves.
def get_source_url(url, wait_seconds = 4)
  @plugin.Clear()
  @plugin.Record() # start capturing HTTP traffic
  @ie.goto(url)
  @value = []   # pid, epid and the assembled letv address
  @youkuid = [] # vid and the assembled youku address
  sleep wait_seconds
  @plugin.Stop()

  @plugin.Log.Entries.each do |entry|
    # to_s keeps the scan going if an entry reports no URL or no body
    # (assumption: the COM bridge can hand back nil here - confirm).
    request_url = entry.url.to_s
    body = entry.content.Data.to_s

    if body.index("var from_type = 'youku'")
      @yid = body[/var vid = '(\w{1,30})/, 1].to_s
      @youkuid[0] = @yid
      @youkuid[1] = "http://v.youku.com/v_show/id_#{@youkuid[0]}.html"
      return @youkuid
    end

    # Later entries overwrite earlier ones, so the last pid / epid seen wins.
    if request_url.index("&pid=")
      @pid = request_url[/&pid=(\d{1,20})/, 1].to_s
      @value[0] = @pid
    end
    if request_url.index("&epid=")
      @epid = request_url[/&epid=(\d{1,20})/, 1].to_s
      @value[1] = @epid
    end
  end

  puts "------------"
  @value[2] = "http://www.letv.com/ptv/pplay/#{@value[0]}/#{@value[1]}.html"
  @value
end
# Second capture pass. Network conditions and the fixed capture window mean the
# first pass does not always find a source address, so the unresolved lines are
# run again and both passes are merged into one file.
#
# sourcelog - result file of the first pass (read only).
# newlog    - file that receives the lines of sourcelog containing neither
#             'letv.com' nor 'youku.com'; it is the input of the second pass.
#
# Steps:
# 1. write the unresolved lines of sourcelog to newlog;
# 2. run(newlog, "sencond<sourcelog>") - the misspelt prefix is the file name
#    the script has always used and is kept as is;
# 3. write every line of the second-pass file (echoing each to stdout),
#    followed by the already resolved lines of sourcelog, to
#    "<sourcelog>.result.txt";
# 4. copy that file to "result" and delete it, using the Windows shell.
#
# Returns whatever Kernel#system returns for the copy/del command.
def tworun(sourcelog, newlog)
  second_log = "sencond#{sourcelog}"
  merged_log = "#{sourcelog}.result.txt"

  # Block-form File.open closes each handle even when run or a read raises.
  File.open(newlog, "w") do |retry_list|
    resolved, unresolved = File.readlines(sourcelog).partition do |line|
      line.index('letv.com') || line.index('youku.com')
    end
    unresolved.each { |line| retry_list << line }
    @tworun_resolved = resolved
  end
  resolved = @tworun_resolved
  @tworun_resolved = nil

  run(newlog, second_log)

  File.open(merged_log, "w") do |all|
    File.readlines(second_log).each do |line|
      puts line
      all << line
    end
    resolved.each { |line| all << line }
  end

  cmd="copy #{sourcelog}.result.txt result && del #{sourcelog}.result.txt "
  system(cmd)
end
# Closes the browser once the work is done by force-killing every
# iexplore.exe process with taskkill.
#
# Returns the result of Kernel#system for that command.
def tear
  system("taskkill /f /im iexplore.exe")
end
# Resolves every page address listed in a file and writes the outcome to a
# result file.
#
# url    - path of a text file holding one page address per line.
# result - path of the file to (over)write with the results.
#
# For each input line exactly one output line is written:
# * "<page> <source>" when get_source_url produced a usable letv address
#   (pid and epid both present) or youku address (vid present);
# * "<page>" alone when nothing usable was found or the lookup timed out, so
#   that a later pass can recognise the line as unresolved and try again.
#
# The 60 second timeout guards the get_source_url call itself, which is where
# the browser and capture time is spent.
def run(url, result)
  File.open(result, "w") do |f| # block form: closed even if a lookup raises
    File.readlines(url).each do |line|
      page = line.strip
      source = nil

      begin
        Timeout::timeout(60) do
          @ids = get_source_url(line)
        end

        found = @ids.to_s
        if found.index('letv.com') # letv video
          puts 'letv video'
          # The letv address is always assembled, even from missing ids, so
          # both ids have to be checked before trusting it.
          source = @ids[2] unless @ids[0].to_s.empty? || @ids[1].to_s.empty?
        elsif found.index('youku.com') # youku video
          puts 'youku video'
          source = @ids[1] unless @ids[0].to_s.empty?
        end
      rescue Timeout::Error => e
        puts e.message + ""
      end

      if source
        join_url = page + " " + source + "\n"
        puts join_url
        f.write(join_url)
      else
        f.write(page + "\n")
      end
    end
  end
end
到此为止,函数基本设计完毕。可以用下面的调用查看执行效果:
# Demo: resolve one page and print the resulting array.
# setup has to run first: get_source_url relies on the @ie and @plugin objects
# that setup creates.
setup
p get_source_url('http://61.hz.letv.com/comic-play/7322/3.shtml')
#p get_source_url('http://play.v.61.com/comic-play/7630/2.shtml')