最近在尝试下载 51CTO 上的整套课程视频。难点在于获取 m3u8 文件的接口地址带有由 JS 计算的签名参数:需要先通过抓包分析 JS 文件得到签名(加密)方式,然后用 Python 实现下载。ts 文件默认按顺序逐个下载,代码中另附了一个多线程下载的版本(默认处于注释状态)。
下载视频的步骤主要有以下几点:
1:找到网页的视频索引文件m3u8
2:分析m3u8文件,找到各个ts文件下载地址,将ts文件下载到临时文件中
3:使用ffmpeg工具将ts文件合并成视频(需要安装ffmpeg软件,请自行百度下载)
4:删除临时文件
主要代码如下:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
import hashlib
import os
import queue
import shutil
import subprocess
import sys
import threading
from urllib.request import urlretrieve

import requests
from bs4 import BeautifulSoup
def get_html(url):
    """Fetch *url* with browser-like request headers and return the body text.

    The response is always decoded as UTF-8, whatever encoding the server
    announces.
    """
    browser_headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; WOW64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/55.0.2883.87 Safari/537.36'
        ),
        'Host': 'edu.51cto.com',
    }
    response = requests.get(url, headers=browser_headers)
    # Force the decoding used by ``response.text`` below.
    response.encoding = 'utf-8'
    page_text = response.text
    return page_text
def parse_couse_id(url):
    """Collect the lesson ids and lesson titles listed on a course page.

    The page is fetched with ``get_html`` and parsed with BeautifulSoup/lxml.
    Every ``<li class="lesson">`` inside ``<div class="lessonList">`` yields
    one entry: the title comes from the link's ``title`` attribute and the id
    is the text after ``?id=`` in the link's ``href``.

    Returns:
        A pair ``(lesson_id_list, name_list)`` of parallel lists.
    """
    soup = BeautifulSoup(get_html(url), 'lxml')
    lesson_container = soup.find("div", {"class": "lessonList"})
    lesson_items = lesson_container.ul.findAll("li", {"class": "lesson"})

    # Read title first, then id, for each item (same access order per item).
    pairs = [
        (item.a["title"], item.a["href"].split('?id=')[1])
        for item in lesson_items
    ]
    lesson_id_list = [lesson_id for _, lesson_id in pairs]
    name_list = [name for name, _ in pairs]
    return lesson_id_list, name_list
def get_m3u8_url(vid):
    """Ask the 51cto player endpoint for the m3u8 playlist URL of a lesson.

    The request carries a ``sign`` parameter computed as the hex MD5 digest
    of the lesson id concatenated with a fixed salt.

    Args:
        vid: Lesson id as a string (it is concatenated with the salt and
            appended to the Referer URL).

    Returns:
        The ``url`` field of the first entry of the ``dispatch`` list in the
        JSON response.

    Raises:
        KeyError / IndexError: if the JSON response has no usable
            ``dispatch`` entry.
        Any exception raised by ``requests`` or by JSON decoding.
    """
    salt = "eDu_51Cto_siyuanTlw"
    auth_str = hashlib.md5((vid + salt).encode("utf-8")).hexdigest()

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3',
        'Host': 'edu.51cto.com',
        # Fixed: the HTTP header is spelled "Referer"; "Refer" is not a
        # recognised header name.
        'Referer': 'http://edu.51cto.com/center/course/lesson/index?id=' + vid,
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
    }
    params = {
        'type': 'course',
        'lesson_type': 'course',
        # Fixed: send the computed digest, not the literal text 'auth_str'.
        'sign': auth_str,
        'lesson_id': vid,
    }
    url = 'http://edu.51cto.com/center/player/play/get-lesson-info'
    r = requests.get(url, headers=headers, params=params)
    r.encoding = 'utf-8'
    return r.json()['dispatch'][0]['url']
def m3u8_list(m3u8_url, title):
    """Download an m3u8 playlist and return the media segment URLs it lists.

    The playlist is saved to the temporary file *title*, parsed, and the
    temporary file is removed again (also when parsing fails).

    Args:
        m3u8_url: URL of the playlist.
        title: Path of the temporary file the playlist is written to.

    Returns:
        A list with one URL per ``#EXTINF`` entry, in playlist order.
        Relative segment URIs are resolved against *m3u8_url*; absolute
        ones are returned unchanged.
    """
    from urllib.parse import urljoin

    urlretrieve(m3u8_url, title)
    segment_urls = []
    try:
        with open(title, 'rt', encoding='utf-8') as playlist:
            expect_uri = False
            for raw_line in playlist:
                # strip() instead of slicing off the last character: the
                # final line of a playlist may have no trailing newline, and
                # slicing would then cut the last character of the URL.
                line = raw_line.strip()
                if line.startswith('#EXTINF'):
                    expect_uri = True
                elif expect_uri and line and not line.startswith('#'):
                    # Blank lines and other tags between #EXTINF and the
                    # segment URI are skipped rather than taken as the URI.
                    segment_urls.append(urljoin(m3u8_url, line))
                    expect_uri = False
    finally:
        if os.path.exists(title):
            os.remove(title)
    return segment_urls
def down(url, path):
    """Download *url* and store the result in the file *path*."""

    def report_progress(blocks_done, block_size, total_size):
        """Print the download progress as a percentage.

        blocks_done: number of blocks transferred so far
        block_size: size of one block
        total_size: size of the remote file
        """
        percent = min(100.0 * blocks_done * block_size / total_size, 100)
        print('%.2f%%' % percent)

    # The progress hook above is defined but not handed to urlretrieve, so
    # the download runs without printing progress.
    urlretrieve(url, path)
def down_ts(m3u8_url_list, path='', title=''):
    """Download every segment URL, one after another, into *path*.

    Segment number ``i`` is stored as ``<path>/<i>.ts``.

    Args:
        m3u8_url_list: Segment URLs in playback order.
        path: Directory the segments are written to.
        title: Label used only in the progress message.
    """
    last_index = len(m3u8_url_list) - 1
    for index, url in enumerate(m3u8_url_list):
        # os.path.join instead of a hard-coded '\\': a literal backslash
        # only acts as a directory separator on Windows.
        path_name = os.path.join(path, '%d.ts' % index)
        down(url, path_name)
        print('[%d/%d]\t\tNow Downing %s_%d.ts' % (index, last_index, title, index))
# Disabled alternative implementation of down_ts that downloads with four
# worker threads fed from a queue.Queue. It lives inside a bare string
# literal, so none of it is executed.
# NOTE(review): the helper names are swapped relative to what they do --
# consumer() puts the (url, path) pairs on the queue, producter() takes them
# off and downloads them. The worker threads loop forever and are started as
# non-daemon threads, so if this version is re-enabled as written the process
# would presumably not exit after url_que.join() returns -- confirm before use.
'''
# 使用多线程下载视频
def down_ts(m3u8_url_list, path='', title=''):
url_lists = []
for index, url in enumerate(m3u8_url_list):
# print(url)
path_name = path + '\\' + str(index) + '.ts'
url_lists.append((url, path_name))
def consumer(url_lists, url_que):
for urls in url_lists:
url_que.put(urls)
def producter(url_que):
while True:
urls = url_que.get()
urlretrieve(*urls)
print("Downing:", urls[1])
url_que.task_done()
url_que = queue.Queue()
for n in range(4):
down_thread = threading.Thread(target=producter, args=(url_que,))
down_thread.start()
consumer(url_lists, url_que)
url_que.join()
'''
def write_confile(path, ts_len):
    """Write ``confile.txt``, the segment list later passed to ffmpeg.

    The file is created in the current working directory and holds one
    ``file '<path>/<i>.ts'`` line for each ``i`` in ``range(ts_len)``, with no
    newline after the last line.
    """
    entries = ['file \'%s/%d.ts\'' % (path, number) for number in range(ts_len)]
    with open('confile.txt', 'w') as fout:
        fout.write('\n'.join(entries))
def delete_file(file_path):
    """Remove *file_path* if it exists; otherwise print an error message."""
    if not os.path.exists(file_path):
        print('ERROR : Path not exist [%s]' % file_path)
        return
    os.remove(file_path)
# Merge the downloaded .ts segments into a single video file
def merge_ts_video(title, v_type='.mp4'):
    """Concatenate the segments listed in ``confile.txt`` with ffmpeg.

    Runs ``ffmpeg -f concat -i confile.txt -c copy <title><v_type>`` in the
    current working directory and prints ffmpeg's stdout and stderr.

    Args:
        title: Output file name without extension.
        v_type: Output file extension, including the leading dot.

    Raises:
        OSError: if the ffmpeg executable cannot be started.
    """
    # Pass an argument list rather than a single command string: without
    # shell=True a string is only split into arguments on Windows, and a
    # list also keeps an output name containing spaces as one argument.
    cmd = ['ffmpeg', '-f', 'concat', '-i', 'confile.txt', '-c', 'copy',
           '%s%s' % (title, v_type)]
    print(' '.join(cmd))
    p = subprocess.Popen(cmd, stdin=subprocess.DEVNULL,
                         stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = p.communicate()
    # ffmpeg may echo file names in the console's own encoding; replace
    # undecodable bytes instead of raising UnicodeDecodeError.
    print(out.decode('utf-8', errors='replace'))
    print(err.decode('utf-8', errors='replace'))
def usage():
    """Print the command-line help, framed by an empty line above and below."""
    help_lines = (
        '',
        'Usage: down_51cto_video.py url',
        'Example: down_51cto_video.py http://edu.51cto.com/center/course/lesson/index?id=98717',
        '',
    )
    for line in help_lines:
        print(line)
def main(url):
    """Download every lesson of the course page at *url* as an mp4 file.

    For each lesson: resolve its m3u8 playlist, download all segments into a
    temporary directory named after the lesson id, merge them with ffmpeg,
    rename the result to the lesson title and remove the temporary files.

    A failure while merging, renaming or cleaning up one lesson is printed
    and the loop continues with the next lesson; the temporary files of that
    lesson are left in place.
    """
    lesson_id_list, name_list = parse_couse_id(url)
    for url_id, v_name in zip(lesson_id_list, name_list):
        # Drop a directory left over from an earlier run before anything
        # else: the playlist is saved to a temporary file with this same
        # name, and os.removedirs cannot delete a non-empty directory.
        if os.path.isdir(url_id):
            shutil.rmtree(url_id)

        m3u8_url = get_m3u8_url(url_id)
        m3u8_url_list = m3u8_list(m3u8_url, url_id)
        ts_len = len(m3u8_url_list)

        os.mkdir(url_id)
        # Fixed: the lesson title is v_name; `test_name` was never defined.
        down_ts(m3u8_url_list, path=url_id, title=v_name)
        write_confile(url_id, ts_len)
        try:
            merge_ts_video(url_id)
            # Fixed: `title` was never defined; the target name is v_name.
            os.rename(url_id + '.mp4', v_name + '.mp4')
            # Remove temporary files (fixed: `v_key` was never defined; the
            # segment directory is url_id). No '<title>.m3u8' file is created
            # by this function, so there is none to delete here.
            shutil.rmtree(url_id)
            os.remove('confile.txt')
        except Exception as e:
            # Per-lesson boundary: report and carry on with the next lesson.
            print(e)
if __name__ == '__main__':
    # Only a missing command-line argument should lead to the usage text.
    # Catching Exception here would also hide unrelated errors (for example
    # a NameError from a missing import) behind the usage message.
    try:
        url = sys.argv[1]
    except IndexError:
        usage()
    else:
        main(url)
注意:上面的代码按原样只能下载免费视频。
小提示:若要下载 51CTO 中的付费视频,需要自行修改其中的代码,大家可以尝试一下!
[培训]内核驱动高级班,冲击BAT一流互联网大厂工作,每周日13:00-18:00直播授课