requests module examples
# coding=utf-8
import requests
url = "http://www.baidu.com"
r = requests.get(url)
# print r.content
# print r.text
# print r.status_code
# print r.headers
print r.cookies
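The example above only covers GET. A minimal sketch of a POST request with a custom header, using the standard requests API (httpbin.org is assumed reachable as a test endpoint):
# coding=utf-8
import requests

# Minimal POST sketch: httpbin.org echoes back the submitted form data
url = "http://httpbin.org/post"
headers = {"User-Agent": "Mozilla/5.0"}    # custom request header
data = {"key1": "val1", "key2": "val2"}    # form-encoded body
r = requests.post(url, data=data, headers=headers)
print(r.status_code)
print(r.json()["form"])    # httpbin returns the parsed form fields as JSON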
hackhttp module examples
testHackHttp1
If you are running Python 2.7.8, hackhttp raises the error: 'module' object has no attribute '_create_unverified_context'.
Fix: upgrade to a newer Python version (ssl._create_unverified_context was only added in 2.7.9).
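A minimal guard you can put at the top of a script to fail early on affected interpreters, assuming (per the note above) that 2.7.9+ is sufficient:
import sys
# ssl._create_unverified_context only exists from Python 2.7.9 onward,
# so refuse to run on older interpreters rather than crash inside hackhttp
if sys.version_info < (2, 7, 9):
    raise RuntimeError('Python 2.7.9+ required, found %s' % sys.version.split()[0])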
# coding=utf-8
import hackhttp
# Create a hackhttp object
hh = hackhttp.hackhttp()
# Define the target URL
url = "http://www.baidu.com"
# Send a simple request
try:
    code, head, html, redirect_url, log = hh.http(url)
except Exception as e:
    print(e)
# Status code
print code
# HTTP headers
print head
# HTML body
print html
testHackHttp2
# coding=utf-8
import hackhttp
hh = hackhttp.hackhttp()
# The second parameter, post, carries the data to submit in the POST body
code, head, body, redirect, log = hh.http('http://httpbin.org/post', post="key1=val1&key2=val2")
print body
testHackHttp3
# coding=utf-8
import hackhttp
# Send a hand-written raw HTTP request via the raw parameter
raw = '''POST /post HTTP/1.1
Host: httpbin.org
User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:45.0) Gecko/20100101 Firefox/45.0
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8
Accept-Language: zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3
Accept-Encoding: gzip, deflate
Connection: close
Content-Type: application/x-www-form-urlencoded
Content-Length: 19

key1=val1&key2=val2'''
hh = hackhttp.hackhttp()
code, head, html, redirect, log = hh.http('http://httpbin.org/post', raw=raw)
print html
testHackHttp4
# coding=utf-8
import hackhttp
# Custom request headers
hh = hackhttp.hackhttp()
# Headers can be supplied as a dict...
headers_dict = {
    'X-Forwarder-For': 'https://q.bugscan.net',
    'Hack-Http': 'Header Dict Val'
}
# ...or as a raw "Name: value" string joined with \r\n
header_str = 'HH_HEADER_1: hh h1 val\r\nHH_HEADER_2: hh h2 val'
code, head, body, redirect, log = hh.http('https://www.baidu.com', headers=header_str)
# log['request'] holds the raw request that was actually sent
print log['request']
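hackhttp's bundled examples suggest the log dict records the raw response alongside the request; if so (an assumption, not confirmed above), the full exchange can be dumped:
# Assumption: the log dict also keeps the raw response under log['response']
print log['response']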
testHackHttp5
# coding=utf-8
import hackhttp
hh = hackhttp.hackhttp()
# proxy is a (host, port) tuple, e.g. a local intercepting proxy on 127.0.0.1:8080
proxy_str = ('127.0.0.1', 8080)
code, head, body, redirect, log = hh.http('http://httpbin.org/post', post="key1=val1&key2=val2", proxy=proxy_str)
# code, head, body, redirect, log = hh.http('http://www.baidu.com', proxy=proxy_str)
print code
BeautifulSoup module examples
testBeautifulSoup1
#coding=utf-8
from bs4 import BeautifulSoup
# https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# Parse the HTML document with BeautifulSoup, returning a soup object
#soup = BeautifulSoup(html_doc)
#soup = BeautifulSoup(html_doc, "html.parser")
soup = BeautifulSoup(html_doc, "lxml")
#print(soup)
# Pretty-print the HTML document
# print(soup.prettify())
# A few simple ways to navigate the parsed tree
# Print the title tag
# print(soup.title)
# <title>The Dormouse's story</title>
# Print the name of the title tag
# print(soup.title.name)
# u'title'
# Print the text inside the title tag
#print(soup.title.string)
# u'The Dormouse's story'
# Print the name of the title tag's parent
#print(soup.title.parent.name)
# u'head'
# Print the first p tag
# print(soup.p)
# <p class="title"><b>The Dormouse's story</b></p>
# Print the class attribute of the first p tag
# print(soup.p['class'])
# u'title'
# Print the first a tag
#print(soup.a)
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
# Get all a tags
#print(soup.find_all('a'))
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
# Get the tag whose id is "link3"
print(soup.find(id="link3"))
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
testBeautifulSoup2
#coding=utf-8
from bs4 import BeautifulSoup
html = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">
<span>Elsie</span>
</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html,'lxml')
# Get the children of a tag
# .contents stores all children of the p tag in a list
#print(soup.p.contents)
# Iterate over the children and print each one
for child in soup.p.contents:
    print child
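Besides .contents, BeautifulSoup exposes the same children as generators; a small sketch using standard bs4 attributes:
# .children is a generator over direct children (same nodes as .contents)
for child in soup.p.children:
    print child
# .descendants walks the whole subtree, grandchildren included
for node in soup.p.descendants:
    print node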
testBeautifulSoup3
#coding=utf-8
from bs4 import BeautifulSoup
html='''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
# Find all ul tags
#print(soup.find_all('ul'))
#print(type(soup.find_all('ul')[0]))  # type of a find_all result element
for ul in soup.find_all('ul'):
    print(ul.find_all('li'))  # call find_all again on each result to get its li tags
'''
find_all(name, attrs, recursive, text, **kwargs)
Searches the document by tag name, attributes, or text content
'''
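A short sketch of the other find_all calling styles against the same soup (all standard bs4 keyword arguments):
# By keyword argument: class_ avoids clashing with the Python keyword "class"
print(soup.find_all('li', class_='element'))
# By id keyword
print(soup.find_all(id='list-1'))
# Cap the number of results returned
print(soup.find_all('li', limit=2))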
testBeautifulSoup4
#coding=utf-8
from bs4 import BeautifulSoup
html='''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1" name="elements">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
print(soup.find_all(attrs={'id': 'list-1'}))  # find elements whose id is list-1
#print(soup.find_all(attrs={'name': 'elements'}))  # find elements whose name attribute is elements
#print(soup.find_all(attrs={'class': 'list'}))
testBeautifulSoup5
#coding=utf-8
from bs4 import BeautifulSoup
html='''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
print(soup.find_all(text='Foo'))  # find all text nodes equal to 'Foo'
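In newer bs4 releases (4.4+) the text argument is aliased as string; both return the matching text nodes rather than tags:
# Equivalent call with the newer keyword name
print(soup.find_all(string='Foo'))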
testBeautifulSoup6
#coding=utf-8
from bs4 import BeautifulSoup
html='''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
#print(soup.select('.panel .panel-heading'))  # .panel-heading elements inside a .panel element
#print(soup.select('ul li'))  # li tags nested inside ul tags
#print(soup.select('#list-2 .element'))  # .element elements inside the element with id list-2
print(type(soup.select('ul')[0]))  # type of a select() result element
testBeautifulSoup7
#coding=utf-8
from bs4 import BeautifulSoup
html='''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
# Get the li tags
for li in soup.select('li'):
    print(li.get_text())
# get_text() extracts the text content of a tag
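get_text() also accepts a strip argument, and every tag exposes stripped_strings; a quick sketch of both (standard bs4 API):
for li in soup.select('li'):
    # strip=True trims surrounding whitespace from the extracted text
    print(li.get_text(strip=True))
# stripped_strings yields every whitespace-trimmed text node in the subtree
print(list(soup.stripped_strings))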
testBeautifulSoup8
#coding=utf-8
from bs4 import BeautifulSoup
html='''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
for ul in soup.select('ul'):
    print(ul['id'])
    print(ul.attrs['id'])
# Attributes can be read either with [attribute_name] or with attrs[attribute_name]
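Both forms raise KeyError when the attribute is missing; Tag.get() returns None (or a default) instead, which is standard bs4 API:
for ul in soup.select('ul'):
    # get() returns None for missing attributes instead of raising KeyError
    print(ul.get('name'))
    print(ul.get('name', 'no-name-attribute'))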
re module examples
# coding: utf-8
# 1. Regular expressions
# A regular expression is a special character sequence that makes it easy to
# check whether a string matches a given pattern
# 2. The re module
# compile(pattern)              create a pattern object
# search(pattern, string)      scan the string for the pattern
# match(pattern, string)       match the pattern at the start of the string
# findall(pattern, string)     return all matches as a list
# sub(pattern, repl, string)   replace every match in the string
# split(pattern, string)       split the string on the pattern
import re
# Using the six functions
# Pattern 1: a plain string pattern that matches the literal text 'abc'
a = re.compile('abc')
print("Pattern object: " + str(a))
b = a.search('abdabcbabab')
print('Match found by search: ' + str(b))
# match() anchors at position 0, so this returns None ('abd...' does not start with 'abc')
c = a.match('abdabcbabab')
print("Match at start of string: " + str(c))
d = re.findall('abc', 'abdabcbabcab')
print("List of matches: " + str(d))
e = re.sub('abc', 'aaa', 'abdabcbabcab')
print("String after substitution: " + str(e))
f = re.split(',', 'ab,bc,cd,de')
print("Split result: " + str(f))
# Pattern-matching flags
# re.I  case-insensitive matching
# re.L  locale-dependent matching for \w, \W, \b, \B
# re.M  multi-line matching, affects ^ and $
# re.S  make . match any character, including newlines
# re.U  interpret characters according to the Unicode character set, affects \w, \W, \b, \B
# re.X  verbose mode: a more flexible format that makes the expression easier to read
# Case-insensitive matching
aa = re.search('abc', 'ABCc1B3ABC2', re.I)
print("Case-insensitive: " + str(aa))
# ^        match the start of the string
# $        match the end of the string
# .        match any character except a newline
# [...]    a character set: [amk] matches 'a', 'm', or 'k'
# [^...]   any character NOT in the set: [^abc] matches anything except a, b, c
# re*      match 0 or more of the preceding expression
# re+      match 1 or more of the preceding expression
# re?      match 0 or 1 of the preceding expression
# re{n}    match exactly n of the preceding expression: o{2} cannot match the 'o'
#          in "Bob", but matches the two o's in "food"
# re{n,}   match n or more of the preceding expression: o{2,} cannot match the 'o'
#          in "Bob", but matches all the o's in "foooood"; o{1,} is equivalent to o+,
#          and o{0,} to o*
# re{n,m}  match between n and m of the preceding expression (greedy)
# ^ match the start of the string
bb = re.findall('^abc', 'ABCc1B3abc2')
print("Match at start: " + str(bb))
# $ match the end of the string
cc = re.findall('abc$', 'ABCc1B3abc')
print("Match at end: " + str(cc))
# . match any character except a newline
dd = re.findall('.', 'ABCc1B3abc2')
print("Match any character: " + str(dd))
# Character set: matches a, b, or c
ee = re.findall('[abc]', 'ABCc1B3abc')
print("Match characters in the set: " + str(ee))
# Match single characters NOT in the set
ee = re.findall('[^abc]', 'ABCc1B3abc')
print("Match characters outside the set: " + str(ee))
# Match 0 or more of the preceding character
ff = re.findall('1*', 'ABCc1B3abc')
print("Match 0 or more: " + str(ff))
# Match 1 or more of the preceding character
ff = re.findall('1+', 'ABCc1B3abc')
print("Match 1 or more: " + str(ff))
# Match 0 or 1 of the preceding character
ff = re.findall('1B3?', 'ABCc1B3abc')
print("Match 0 or 1: " + str(ff))
# re{n}: match exactly n of the preceding expression
ff = re.findall('c{2}', 'ABCcc1B3abc')
print("Match exactly n: " + str(ff))
# re{n,}: match n or more of the preceding expression
ff = re.findall('c{2,}', 'ABcCcc1Bccccc3abc')
print("Match n or more: " + str(ff))
# re{n,m}: match between n and m of the preceding expression (greedy)
ff = re.findall('c{2,4}', 'ABcCcc1Bccc3abc')
print("Match between n and m: " + str(ff))
# \d  match a digit, equivalent to [0-9]
# \D  match a non-digit, equivalent to [^0-9]
# \s  match any whitespace character (space, tab, form feed, ...), equivalent to [ \f\n\r\t\v]
# \S  match any non-whitespace character, equivalent to [^ \f\n\r\t\v]
# \w  match any word character including the underscore, equivalent to [A-Za-z0-9_]
# \W  match any non-word character, equivalent to [^A-Za-z0-9_]
# \d match a digit
ff = re.findall('\d', 'ABcCcc1Bccc3abc')
print("Match a digit: " + str(ff))
# \D match a non-digit
ff = re.findall('\D', '123c1Bc3abc')
print("Match a non-digit: " + str(ff))
# \s match any whitespace character
ff = re.findall('\s', '123 c1Bc 3abc')
print("Match whitespace: " + str(ff))
# \S match any non-whitespace character
ff = re.findall('\S', '1 c2c 3abc')
print("Match non-whitespace: " + str(ff))
# \w match any word character including the underscore
ff = re.findall('\w', '1 c_A2c 3abc')
print("Match word characters: " + str(ff))
# \W match any non-word character
ff = re.findall('\W', '1 c_2c 3abc')
print("Match non-word characters: " + str(ff))
# Greedy vs non-greedy matching
# Regular expressions are commonly used to find matching strings in text.
# Python's quantifiers are greedy by default: they try to match as many
# characters as possible. Non-greedy quantifiers do the opposite and try
# to match as few characters as possible.
# The '3' is optional, so '1B' still matches even though no '3' follows it here
ff = re.findall('1B3?', 'ABCc1Bx3abc')
print("Match 0 or 1: " + str(ff))