网页hack程序编写
网页hack程序编写
batch
Q: 脚本当前路径
%~dp0
Q:用tee重定向
cmd 2>&1 | "C:\Program Files\Git\usr\bin\tee" pyDownload.log
python
使用wget模块下载网页
def download(url, file):
    """Download ``url`` to ``file`` by running the ``wget`` module in a child process.

    Args:
        url: Address of the resource to fetch.
        file: Destination path, handed to ``python -m wget`` through ``-o``.

    Raises:
        RuntimeError: If the child process exits with a non-zero return code.
        subprocess.TimeoutExpired: If the child runs longer than 8 seconds.
    """
    import subprocess  # local import so the snippet is usable on its own

    # An argument list needs no manual quoting (spaces in either value are
    # safe) and runs without a shell on every platform.
    cmd = ['python', '-m', 'wget', url, '-o', file]
    cmd_text = subprocess.list2cmdline(cmd)
    ret = subprocess.run(cmd, timeout=8).returncode
    print(' %s return %d' % (cmd_text, ret))
    if ret != 0:
        # Raising a plain string is itself a TypeError in Python 3.
        raise RuntimeError('error, %s return %d' % (cmd_text, ret))
获取a和b中间的字符串
def getStrIn(cont, a, b):
    """Return the text in ``cont`` between the first ``a`` and the next ``b``.

    Args:
        cont: String to search.
        a: Opening delimiter; the result starts right after its first match.
        b: Closing delimiter; searched for only after the end of ``a``.

    Returns:
        The substring between the two delimiters, or ``''`` when ``a`` is not
        found or ``b`` does not occur after it.
    """
    start = cont.find(a)
    if start < 0:
        # Without this guard the -1 from find() would be used as an offset.
        return ''
    start += len(a)
    end = cont.find(b, start)
    if end < 0:
        # A -1 end index would silently drop the last character instead.
        return ''
    return cont[start:end]
异常处理
try:
    xxx  # placeholder for the statements that may fail
except Exception as exc:
    # ``except Exception`` lets KeyboardInterrupt and SystemExit propagate,
    # which a bare ``except:`` would swallow.
    print("Unexpected error:", type(exc))
发送json数据
# NOTE: ``data=`` sends the fields form-encoded; use ``json=`` instead if the
# endpoint expects a JSON request body.
try:
    response = requests.post(
        'http://www.hzcourse.com/web/refbook/queryAllChapterList',
        data={'ebookId': ebookId, 'token': token},
        timeout=10,  # do not block forever on an unresponsive server
    )
    resp_json = response.json()
except (requests.RequestException, ValueError) as exc:
    # Network failures raise RequestException; a non-JSON body raises ValueError.
    print("Unexpected error:", type(exc))
    resp_json = None  # keep the name defined for the code that follows
发送数据2
import requests

# The URL is assembled from adjacent string literals so that it contains no
# embedded newlines (a triple-quoted string spanning lines would keep them).
link = (
    "https://api-zero.livere.com/v1/comments/"
    "list?callback=jQuery1124049866736766120545_"
    "1506309304525&limit=10&offset=1&repSeq=3871836"
    "&requestPath=%2Fv1%2Fcomments%2Flist"
    "&consumerSeq=1020&livereSeq=28583"
    "&smartloginSeq=5154&_=1506309304527"
)
# Send a browser-like User-Agent header with the request.
headers = {'User-Agent' : 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
r = requests.get(link, headers=headers, timeout=10)
print(r.text)
selenium
模板代碼
import os, sys, requests
from selenium import webdriver
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import time
import logging
class Result:
    """Holder for the outcome of a translation."""

    def __init__(self):
        # Translated text; starts empty and is assigned by the translator.
        self.text = ''
class TranslatorGoogle:
    """Translate text by driving the Google Translate web page in Chrome."""

    def __init__(self):
        """Start Chrome, maximize the window and open the translation page."""
        browser = webdriver.Chrome()
        browser.maximize_window()
        self.browser = browser
        url = 'https://translate.google.com.hk/?hl=en&tab=rT&sl=en&tl=zh-CN&op=translate'
        browser.get(url)

    def __del__(self):
        """Shut the browser down when the translator is garbage-collected."""
        # __init__ may have failed before ``self.browser`` was assigned.
        browser = getattr(self, 'browser', None)
        if browser is None:
            return
        try:
            # quit() closes every window and ends the driver session, so a
            # separate close() call beforehand is not needed.
            browser.quit()
        except Exception:
            logging.exception('failed to quit browser')
        else:
            logging.info('quited')

    def translate(self, content, src='', dest=''):
        """Translate ``content`` and return a ``Result`` holding the text.

        Args:
            content: Text to translate.
            src: Accepted for interface compatibility; not used. The language
                pair is fixed by the URL opened in ``__init__``.
            dest: Accepted for interface compatibility; not used.

        Returns:
            A ``Result`` whose ``text`` is the ``innerText`` of the output element.
        """
        logging.info('translating %s', content)
        browser = self.browser
        wait = WebDriverWait(browser, 10)  # wait up to 10 s for each element
        logging.info('try get input element')
        source_box = wait.until(EC.presence_of_element_located(
            (By.XPATH, '/html/body/c-wiz/div/div[2]/c-wiz/div[2]/c-wiz/div[1]/div[2]/div[2]/c-wiz[1]/span/span/div/textarea')))
        logging.info('clear input element')
        source_box.clear()
        # The text is written through JavaScript rather than send_keys(content).
        logging.info('send script to input element')
        browser.execute_script('''
            //var t = document.evaluate("//textarea", document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
            var t = arguments[1];
            t.focus();
            t.value=arguments[0];
            //var event = new CustomEvent('change');
            //t.dispatchEvent(event); // no effect
            //t.change(); // run error
            ''', content, source_box)
        # Presumably needed so the page registers the scripted value as typed
        # input -- TODO confirm.
        source_box.send_keys(' \n')
        logging.info('sleep3s')
        time.sleep(3)
        logging.info('try get output element')
        output = wait.until(EC.presence_of_element_located(
            (By.XPATH, "//*[@class='J0lOec']")))
        translated = output.get_attribute("innerText")
        logging.info('output is %s', translated)
        result = Result()
        result.text = translated
        return result
class TranslatorDeepL:
    """Translate text by driving the DeepL translator web page in Chrome."""

    def __init__(self):
        """Start Chrome, maximize the window and open the translator page."""
        browser = webdriver.Chrome()
        browser.maximize_window()
        self.browser = browser
        url = 'https://www.deepl.com/translator'
        browser.get(url)

    def __del__(self):
        """Log destruction; the browser window is deliberately left open."""
        # The close()/quit() calls were disabled in the original, so only the
        # log message remains.
        logging.info('quited')

    def translate(self, content, src='', dest=''):
        """Translate ``content`` and return a ``Result`` holding the text.

        Args:
            content: Text to translate.
            src: Accepted for interface compatibility; not used.
            dest: Accepted for interface compatibility; not used.

        Returns:
            A ``Result`` whose ``text`` is the ``innerText`` of the output element.
        """
        logging.info('translating %s', content)
        browser = self.browser
        wait = WebDriverWait(browser, 10)  # wait up to 10 s for each element
        logging.info('try get input element')
        source_box = wait.until(EC.presence_of_element_located(
            (By.XPATH, '/html/body/div[2]/div[1]/div[5]/div[2]/div[1]/div[2]/div/textarea')))
        logging.info('clear input element')
        source_box.clear()
        # The text is written through JavaScript rather than send_keys(content).
        logging.info('send script to input element')
        browser.execute_script('''
            var t = arguments[1];
            t.focus();
            t.value=arguments[0];
            ''', content, source_box)
        # Presumably needed so the page registers the scripted value as typed
        # input -- TODO confirm.
        source_box.send_keys(' \n')
        logging.info('sleep6s')
        time.sleep(6)
        logging.info('try get output element')
        output = wait.until(EC.presence_of_element_located(
            (By.XPATH, "/html/body/div[2]/div[1]/div[5]/div[2]/div[3]/div[3]/div[1]/textarea")))
        # NOTE(review): the output element is a textarea; its text may be
        # exposed through the "value" property rather than innerText -- verify.
        translated = output.get_attribute("innerText")
        logging.info('output is %s', translated)
        result = Result()
        result.text = translated
        return result
class Translator(TranslatorDeepL):
    """Default translator; inherits everything from ``TranslatorDeepL``."""
Q: click()函数有时候会hang怎么办?
A:不知道
Q: get()函数有时候会hang怎么办?
A:没有好办法。 一般是因为网页加载未完成。如果你觉得已经ok了,点一下浏览器的停止按钮就可以
打开网页
# Start a Chrome session through Selenium and load the target page.
from selenium import webdriver
driver = webdriver.Chrome()
driver.get('http://ebooks.cmanuf.com/all?id=1&type=2&code=AC05')
通过css selector获取网页内容
from selenium.webdriver.common.by import By

# find_element_by_css_selector() was deprecated in Selenium 4 and later
# removed; find_element(By.CSS_SELECTOR, ...) is the supported form.
# :nth-child() counts from 1, hence i + 1 for a zero-based index.
book = driver.find_element(By.CSS_SELECTOR, '#booklist > dd:nth-child(%d) > a' % (i + 1))
book_href = book.get_attribute('href')
book_text = book.text
tampermonkey
header模板
// ==UserScript==
// @name TimeBooking
// @namespace http://tampermonkey.net/
// @version 0.1
// @description try to take over the world!
// @author You
// @match https://www.citibank.com.hk/*
// @match https://www.services.online-banking.hsbc.com.hk/*
// @match https://e-banking1.hangseng.com/*
// @match https://ebsnew.boc.cn/*
// @match https://its.bochk.com/cdc.overview.do
// @grant GM_xmlhttpRequest
// ==/UserScript==
(function() {
'use strict';
})();
xpath用法模板
// Collect every <textarea> matched by the XPath and remember the value of
// the last one that is not empty.
var xpath = '//textarea';
var tags_data_image = document.evaluate(xpath, document, null, XPathResult.ANY_TYPE, null);
var textareavalue = '';
var tags = [];
var tag = tags_data_image.iterateNext();
while (tag) {
    tags.push(tag);
    if (tag.value.length > 0) {
        textareavalue = tag.value;
    }
    // iterateNext() returns null once the matches are exhausted.
    tag = tags_data_image.iterateNext();
}
/**
 * Evaluate an XPath expression against the document.
 * @param {string} STR_XPATH XPath expression to evaluate.
 * @returns {Array} Every node yielded by the result iterator, in iteration order.
 */
function _x(STR_XPATH) {
    var iterator = document.evaluate(STR_XPATH, document, null, XPathResult.ANY_TYPE, null);
    var found = [];
    for (var node = iterator.iterateNext(); node; node = iterator.iterateNext()) {
        found.push(node);
    }
    return found;
}
$(_x('/html/.//div[@id="text"]')).attr('id', 'modified-text');
Logging
console.log(...)
current url
var currentLocation = window.location;
currentLocation.host
Find substring
s.indexOf('citibank')
JSON to string
var sFinal = {"value1":s2, "value2":today};
alert(JSON.stringify(sFinal));
send JSON request
// NOTE(review): the webhook key is embedded in the URL below; treat it as a
// secret and rotate it if this snippet is shared.
GM_xmlhttpRequest({
    method: "POST",
    url: 'https://maker.ifttt.com/trigger/bankmoney/with/key/feQcXd0QuePnJb23E97bv',
    data: JSON.stringify(sFinal),
    headers: {
        "Content-Type": "application/json"
    },
    onload: function (response) {
        // ``response`` is an object; concatenating it directly would only
        // print "[object Object]", so report its status and body instead.
        console.log("got response " + response.status + " " + response.responseText);
        alert("Success " + response.responseText);
    },
    onerror: function (response) {
        // Without this handler a failed request goes unnoticed.
        console.log("request failed " + response.status + " " + response.statusText);
    }
});
insert a node after a node
/**
 * Insert newElement into the DOM directly after targetElement.
 * @param {Node} newElement Node to insert.
 * @param {Node} targetElement Existing node that newElement should follow.
 */
function insertAfter(newElement, targetElement) {
    // insertBefore() with a null reference node appends at the end, so the
    // "target is the last child" case needs no separate branch. (The old
    // branch tested the misspelled ``parent.lastchild`` and never ran.)
    targetElement.parentNode.insertBefore(newElement, targetElement.nextSibling);
}
create a html node
// Build a clickable link element that runs updateAll when clicked.
var btn = document.createElement("a");
btn.innerText = 'DoMyTask!';
btn.addEventListener("click", updateAll);
// ``color`` is not an attribute of <a>, so setAttribute('color', ...) has no
// visual effect; set the colour through the inline style instead.
btn.style.color = 'red';
stop on debugger
debugger;
when the window has loaded, call a function
// After the window's load event, poll until getRefObject() returns something,
// then run main() exactly once.
window.addEventListener('load', function() {
    var checkExist = setInterval(function() {
        if (getRefObject() != null) {
            console.log("Exists!");
            // Stop polling before calling main(), so an exception inside
            // main() cannot cause it to be invoked again on the next tick.
            clearInterval(checkExist);
            main();
        }
    }, 1000); // check once per second
}, false);
css selector
// Select every element matching the selector list below and detach each one
// from its parent node.
var matches = document.querySelectorAll(".adaver_box, #div-ad-top, #adHeaderTop, #adFlashLink, *[id^='adRectangle'], #adTextLink, *[id^='divSkyscraper'], *[id^='div-ad-'], *[id^='google_ads'], .anv-ad-content");
Array.prototype.forEach.call(matches, function (node) {
    var owner = node.parentNode;
    owner.removeChild(node);
});
Use jquery
// @require http://ajax.googleapis.com/ajax/libs/jquery/1.8.3/jquery.min.js
// Remove the inline onmousedown attribute from every link whose handler
// starts with "return rwt(".
$('a[onmousedown^="return rwt("]').removeAttr('onmousedown');
// For each matching web-cache link inside a dropdown menu item, insert a
// space and then the link itself right after the closest enclosing
// div.action-menu.ab_ctl container.
$('li.action-menu-item.ab_dropdownitem a[href^="http://webcache.googleusercontent."]').each(
function() {
$(this).closest('div.action-menu.ab_ctl').after(' ').after($(this))
}
)
如何分析网页
Chrome浏览器“检查”功能
步骤一:打开“检查”功能。用Chrome浏览器打开Hello World文章。右击页面的任意位置,在弹出的快捷菜单中单击“检查”命令,得到如图4-5所示的页面窗口。
步骤二:找到真实的数据地址。单击页面中的Network选项,然后刷新网页。此时,Network会显示浏览器从网页服务器中得到的所有文件,一般这个过程称为“抓包”。因为所有文件已经显示出来了,所以需要的评论数据一定在其中。
一般而言,这些数据可能以 json 文件格式获取。我们可以在Network中的 All找到真正的评论文件“list?callback=jQuery11240879907919223679”。点击 Preview 即可查看数据,如图4-6所示。
步骤三:爬取真实评论数据地址。既然找到了真实的地址,接下来就可以直接用requests请求这个地址获取数据了,代码如下:
import requests

# The URL is assembled from adjacent string literals so that it contains no
# embedded newlines (a triple-quoted string spanning lines would keep them).
link = (
    "https://api-zero.livere.com/v1/comments/"
    "list?callback=jQuery1124049866736766120545_"
    "1506309304525&limit=10&offset=1&repSeq=3871836"
    "&requestPath=%2Fv1%2Fcomments%2Flist"
    "&consumerSeq=1020&livereSeq=28583"
    "&smartloginSeq=5154&_=1506309304527"
)
# Send a browser-like User-Agent header with the request.
headers = {'User-Agent' : 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
r = requests.get(link, headers=headers, timeout=10)
print(r.text)
Links
https://zhuanlan.zhihu.com/p/31127887 requests
https://zhuanlan.zhihu.com/p/31127896 selenium
https://zhuanlan.zhihu.com/p/73742321
https://www.jianshu.com/p/beb200cda628 selenium