如何下载web资源
[TOC] 如何下载web资源
目的
发现有些书是以web页面的方式给用户看的,一张一张,很难一次性下载
有没有办法一次性下载它们呢?
比如书
研究
test 1: chrome extension
上网查到很多chrome extension,但是它们都识别不到页面内的链接。这是因为页面里面根本没有真正的链接(href只是javascript:void(0),真实地址放在ref属性里)
比如
页面链接如下
<a href="javascript:void(0);" onclick="probation.readBook(this);" id="678612" ref="/openresources/teach_ebook/uncompressed/13780/OEBPS/Text/chapter33.html#heading_id_3">3.1 协商原则</a>
该链接其实最终变成http://www.hzcourse.com/resource/readBook?path=/openresources/teach_ebook/uncompressed/13780/OEBPS/Text/chapter33.html
所以怪不得扩展不认识了
看来还是要自己写一个了
最简单就是用python了
测试以上链接
C:\Users\cutep>python -m wget http://www.hzcourse.com/resource/readBook?path=/openresources/teach_ebook/uncompressed/13780/OEBPS/Text/chapter33.html -o 33.html
100% [................................................................................] 4000 / 4000
Saved under 33.html
成功!
test 2: 最终写了如下python脚本
import os,sys
import subprocess
#from selenium import webdriver
#from urllib2 import urlopen
import requests
import wget
def my_system(cmd):
    """Echo a shell command to stdout, then run it through the system shell.

    Args:
        cmd: Command line string, handed to ``os.system`` unchanged.

    Returns:
        The value returned by ``os.system`` (0 when the command succeeded),
        so callers can detect a failed command instead of losing the status.
    """
    print(cmd)
    return os.system(cmd)
def download(url, file, timeout=8):
    """Download ``url`` into ``file`` by running ``python -m wget`` as a child process.

    Args:
        url: Address to fetch.
        file: Output path passed to wget's ``-o`` option.
        timeout: Seconds to wait for the child process before giving up.

    Raises:
        RuntimeError: If the wget process exits with a non-zero status.
        subprocess.TimeoutExpired: If the process runs longer than ``timeout``.
    """
    # An argument list (instead of one formatted command string) needs no
    # quoting for URLs containing '?' or '&' and works without a shell on
    # every platform. sys.executable runs wget with this same interpreter.
    cmd = [sys.executable, '-m', 'wget', url, '-o', file]
    ret = subprocess.run(cmd, timeout=timeout).returncode
    shown = ' '.join(cmd)
    print(' %s return %d' % (shown, ret))
    if ret != 0:
        # Raising a plain string is itself a TypeError in Python 3, which
        # would hide the real failure; raise a proper exception instead.
        raise RuntimeError('error, %s return %d' % (shown, ret))
def download_chapter(click_url, file):
    """Fetch one chapter through the site's ``readBook`` endpoint.

    Args:
        click_url: Chapter resource path; inserted as-is into the ``path``
            query parameter.
        file: Destination file name, handed on to ``download``.
    """
    chapter_url = 'http://www.hzcourse.com/resource/readBook?path=%s' % click_url
    download(chapter_url, file)
def get_bookname(cont):
    """Extract the book title from the landing-page HTML.

    The title is the text of the first ``<span>`` that follows the
    ``<div class="book-name">`` marker.

    Args:
        cont: HTML of the landing page as a string.

    Returns:
        The text between ``<span>`` and ``</span>``. Nothing is validated:
        when a marker is absent ``str.find`` yields -1 and the returned
        slice is not meaningful.
    """
    div_marker = '<div class="book-name">'
    span_open = '<span>'

    # Skip past the div marker, then past the first <span> that follows it.
    start = cont.find(div_marker) + len(div_marker)
    start = cont.find(span_open, start) + len(span_open)
    end = cont.find('</span>', start)
    return cont[start:end]
def get_value_token(cont):
    """Pull the ``ebookId`` and ``token`` form values out of the landing page.

    Both values are read from ``value="..."/>`` attributes; the token is
    searched for starting at the position where the ebookId value ended.

    Args:
        cont: HTML of the landing page as a string.

    Returns:
        A two-item list ``[ebookId, token]`` of strings. The pair is also
        printed to stdout.
    """
    id_marker = '"ebookId" value="'
    token_marker = 'name="token" value="'
    closing = '"/>'

    id_start = cont.find(id_marker) + len(id_marker)
    id_end = cont.find(closing, id_start)

    token_start = cont.find(token_marker, id_end) + len(token_marker)
    token_end = cont.find(closing, token_start)

    found = [cont[id_start:id_end], cont[token_start:token_end]]
    print('ebookId, token %s %s' % tuple(found))
    return found
def download_book(main_link):
    """Download every chapter of one book into a folder named after the book.

    Steps: fetch the landing page into ``main.html``, read the book name and
    the ``ebookId``/``token`` form values from it, ask the site's
    ``queryAllChapterList`` endpoint for the chapter list, then download each
    chapter that is not on disk yet. A ``finish.txt`` marker inside the book
    folder makes later runs skip the book.

    Failures are reported on stdout rather than raised, so one broken book
    does not stop a batch: a book whose landing page, folder or chapter list
    cannot be obtained is skipped, and the chapter loop gives up after five
    failed chapters.

    Args:
        main_link: URL of the book's landing page.
    """
    # Function-scope import: keeps this block independent of the file header.
    import shutil

    main_file = 'main.html'
    # Remove the landing page left behind by a previous book before fetching.
    try:
        os.remove(main_file)
    except FileNotFoundError:
        pass

    # `except Exception` (not a bare except) so Ctrl-C still stops the batch.
    try:
        download(main_link, main_file)
    except Exception as exc:
        print("Unexpected error:", type(exc).__name__, exc)
        return

    with open(main_file, 'r', encoding='utf-8') as fh:
        main_cont = fh.read()
    ebookId, token = get_value_token(main_cont)
    bookname = get_bookname(main_cont)
    print(bookname)

    dest_folder = bookname
    if not dest_folder:
        print("Unexpected error:", 'book name not found in %s' % main_link)
        return
    # Standard-library calls replace the Windows-only `md` / `copy /y` shell
    # commands, so the scraped book name is never interpolated into a shell.
    try:
        os.makedirs(dest_folder, exist_ok=True)
        shutil.copy(main_file, dest_folder)
    except OSError as exc:
        print("Unexpected error:", type(exc).__name__, exc)
        return

    finish_file = os.path.join(dest_folder, 'finish.txt')
    if os.path.isfile(finish_file):
        return

    try:
        response = requests.post(
            'http://www.hzcourse.com/web/refbook/queryAllChapterList',
            data={'ebookId': ebookId, 'token': token},
            timeout=30,  # without a timeout a stalled server blocks forever
        )
        # Indexing stays inside the try: an error payload without
        # data.data must skip the book, not crash the batch.
        chapters = list(response.json()['data']['data'])
    except Exception as exc:
        print("Unexpected error:", type(exc).__name__, exc)
        return

    failures = 0
    for chapter in chapters:
        ref_link = chapter['ref']
        # The file name is whatever follows the last '/' of the resource path.
        file = ref_link[ref_link.rfind('/') + 1:]
        print(ref_link, file)
        dest_file = os.path.join(dest_folder, file)
        if os.path.isfile(dest_file):
            continue
        try:
            download_chapter(ref_link, dest_file)
        except Exception as exc:
            failures += 1
            print("Unexpected error:", type(exc).__name__, exc)
            if failures > 4:
                return

    if failures:
        # Do not mark the book finished: the marker would make every later
        # run skip it and the missing chapters would never be retried.
        print('%d chapter(s) failed, %s not written' % (failures, finish_file))
        return
    with open(finish_file, 'w') as fh:
        fh.write('done')
# Landing pages of the books to download. Ids are listed exactly as they were
# collected, repeats included; main() de-duplicates the list before use.
_TOKEN = 'aec187ed5b3a41728b0fb8bd82c3be22'
_BOOK_IDS = (
    6736, 6736, 6856, 7899, 7249, 7165, 7186, 7523, 6965, 6826,
    6166, 6188, 6853, 4599, 6759, 6772, 6754, 6755, 6856, 6965,
    7027, 6736, 6821, 6622, 6606, 6751, 6652, 6050, 6965,
)
files = [
    'http://www.hzcourse.com/web/refbook/probationAll/%d/%s' % (book_id, _TOKEN)
    for book_id in _BOOK_IDS
]
def unique(list1):
    """Return the distinct items of ``list1`` in first-seen order.

    Going through a ``set`` would drop duplicates too, but it also discards
    the input order, which made the download order arbitrary. Dict keys keep
    insertion order, so the first occurrence of each item decides its place.

    Args:
        list1: Iterable of hashable items.

    Returns:
        A new list with duplicates removed.
    """
    return list(dict.fromkeys(list1))
def main():
    """Download every distinct book listed in ``files``, one after another."""
    for link in unique(files):
        download_book(link)


if __name__ == '__main__':
    # Start downloading only when run as a script, so importing this module
    # (for example to reuse its helpers) has no network side effects.
    main()
Test result
Saved under chapter51.xhtml
/openresources/teach_ebook/uncompressed/16571/OEBPS/Text/chapter52.xhtml chapter52.xhtml
python -m wget http://www.hzcourse.com/resource/readBook?path=/openresources/teach_ebook/uncompressed/16571/OEBPS/Text/chapter52.xhtml -o chapter52.xhtml
100% [................................................................................] 1058 / 1058
Saved under chapter52.xhtml
/openresources/teach_ebook/uncompressed/16571/OEBPS/Text/chapter53.xhtml chapter53.xhtml
python -m wget http://www.hzcourse.com/resource/readBook?path=/openresources/teach_ebook/uncompressed/16571/OEBPS/Text/chapter53.xhtml -o chapter53.xhtml
100% [................................................................................] 4625 / 4625
Saved under chapter53.xhtml
/openresources/teach_ebook/uncompressed/16571/OEBPS/Text/chapter54.xhtml chapter54.xhtml
python -m wget http://www.hzcourse.com/resource/readBook?path=/openresources/teach_ebook/uncompressed/16571/OEBPS/Text/chapter54.xhtml -o chapter54.xhtml
100% [..................................................................................] 705 / 705
Saved under chapter54.xhtml
/openresources/teach_ebook/uncompressed/16571/OEBPS/Text/chapter55.xhtml chapter55.xhtml
python -m wget http://www.hzcourse.com/resource/readBook?path=/openresources/teach_ebook/uncompressed/16571/OEBPS/Text/chapter55.xhtml -o chapter55.xhtml
100% [................................................................................] 1814 / 1814
Saved under chapter55.xhtml
/openresources/teach_ebook/uncompressed/16571/OEBPS/Text/chapter56.xhtml chapter56.xhtml
python -m wget http://www.hzcourse.com/resource/readBook?path=/openresources/teach_ebook/uncompressed/16571/OEBPS/Text/chapter56.xhtml -o chapter56.xhtml
100% [..............................................................................] 10025 / 10025
Saved under chapter56.xhtml
/openresources/teach_ebook/uncompressed/16571/OEBPS/Text/chapter57.xhtml chapter57.xhtml
python -m wget http://www.hzcourse.com/resource/readBook?path=/openresources/teach_ebook/uncompressed/16571/OEBPS/Text/chapter57.xhtml -o chapter57.xhtml
其他
下面这个是啥框架写的?
A: avalonjs
<li ms-for="bookChapter in @bookChapters">
<a href="javascript:void(0);" onclick="probation.readBook(this);" ms-attr="{id : bookChapter.id, ref : bookChapter.ref}"></a>
</li>
bookChapter在哪里定义的?
var probation = {
// Read the trimmed value of the #condition element and request the chapter
// list filtered by it.
// NOTE(review): the call goes through `ebookRead`, which is not defined in
// this excerpt, rather than through `probation` itself — confirm the target.
search:function(){
var key = $.trim($("#condition").val());
ebookRead.queryEbookChapterList(key);
},
// POST ebookId, key and token to web/refbook/queryAllChapterList and, when the
// reply has data.code == 1 and a non-empty data.data array, publish that array
// as the chapter list and open the first chapter. Any other code shows
// data.message in an alert.
queryEbookChapterList:function(key){
// ebookId and token are read from the page elements with those ids.
var ebookId = $.trim($("#ebookId").val());
var token = $.trim($("#token").val());
// Breakpoint statement left in the site's code; it only pauses execution
// while a debugger is attached.
debugger;
jQuery.ajax({
type : "post" ,
url : "web/refbook/queryAllChapterList",
dataType : "json" ,
data : {ebookId:ebookId,key:key,token:token},
success : function(obj) {
if(obj.data.code==1){
var bookChapters = obj.data.data;
if(bookChapters.length > 0){
// bookChaptertCtrl is defined outside this excerpt; presumably it is the
// view-model whose bookChapters list the page template renders — confirm.
bookChaptertCtrl.bookChapters = bookChapters;
$("#chapterCont").load();
// Simulate a click on the first chapter link under #directories.
$("#directories").find("li").first().children("a").click();
}
} else {
alert(obj.data.message);
}
}
});
},
如何获取链接?
万能的chrome F12了