The programs below use Python 2.7.

requests module example

#coding=utf-8
import requests
url = "http://www.baidu.com"
# Issue a GET request
r = requests.get(url)
# print r.content      # raw response body (bytes)
# print r.text         # response body decoded to text
# print r.status_code  # HTTP status code
# print r.headers      # response headers
print r.cookies
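
The example above only issues a GET. For comparison with the hackhttp POST examples below, here is a minimal sketch of the same form submission with requests, using httpbin.org (which echoes the request back); the field names are illustrative:

#coding=utf-8
import requests
# POST form data; requests encodes the dict as application/x-www-form-urlencoded
r = requests.post("http://httpbin.org/post", data={'key1': 'val1', 'key2': 'val2'})
print r.status_code
print r.text  # httpbin echoes the submitted fields back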

hackhttp module examples

testHackHttp1

Running this on Python 2.7.8 produces the error 'module' object has no attribute '_create_unverified_context'.
Fix: upgrade to Python 2.7.9 or later (ssl._create_unverified_context was added in 2.7.9).
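
Before calling into hackhttp you can check whether the running interpreter is new enough; a minimal sketch using only the standard library:

# coding=utf-8
import ssl, sys
# ssl._create_unverified_context was added in Python 2.7.9 (PEP 476);
# hackhttp fails on older interpreters such as 2.7.8.
if not hasattr(ssl, '_create_unverified_context'):
    print 'Python 2.7.9+ required, found %s' % sys.version.split()[0]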

# coding=utf-8
import hackhttp
# Create a hackhttp object
hh = hackhttp.hackhttp()
# Target url
url = "http://www.baidu.com"
# Issue a simple request
try:
    code, head, html, redirect_url, log = hh.http(url)
except Exception as e:
    print(e)
else:
    # Status code
    print code
    # HTTP response headers
    print head
    # HTML body
    print html

testHackHttp2

# coding=utf-8
import hackhttp
hh = hackhttp.hackhttp()
# The post keyword argument carries the POST body
code, head, body, redirect, log = hh.http('http://httpbin.org/post', post="key1=val1&key2=val2")
print body
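
If the POST data starts out as a dict, the standard library can build the key1=val1&key2=val2 string; a small sketch (urllib.urlencode is part of Python 2's standard library):

# coding=utf-8
import urllib
params = {'key1': 'val1', 'key2': 'val2'}
post_body = urllib.urlencode(params)  # 'key1=val1&key2=val2' (key order may vary)
print post_body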

testHackHttp3

# coding=utf-8
import hackhttp
# An alternative header block kept for reference only; it is not used below.
raw_content = '''
POST /post HTTP/1.1
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8
Accept-Language: zh-CN
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763
Accept-Encoding: gzip, deflate
Host: httpbin.org
Connection: close
'''
# A complete raw HTTP request: request line, headers, a blank line, then the body
raw = '''POST /post HTTP/1.1
Host: httpbin.org
User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:45.0) Gecko/20100101 Firefox/45.0
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8
Accept-Language: zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3
Accept-Encoding: gzip, deflate
Connection: close
Content-Type: application/x-www-form-urlencoded
Content-Length: 19

key1=val1&key2=val2'''
hh = hackhttp.hackhttp()
code, head, html, redirect, log = hh.http('http://httpbin.org/post', raw=raw)
print html
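
Content-Length must equal the byte length of the body; when hand-editing a raw request it is easy to let the two drift apart. A quick way to recompute it:

# coding=utf-8
body = "key1=val1&key2=val2"
print len(body)  # 19, the value to put in the Content-Length header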

testHackHttp4

# coding=utf-8
import hackhttp
# Custom request headers
hh = hackhttp.hackhttp()
# Dict form of the headers
headers_dict = {
    'X-Forwarder-For': 'https://q.bugscan.net',
    'Hack-Http': 'Header Dict Val'
}
# String form: one "Name: value" pair per line, joined with \r\n
header_str = 'HH_HEADER_1: hh h1 val\r\nHH_HEADER_2: hh h2 val'
code, head, body, redirect, log = hh.http('https://www.baidu.com', headers=header_str)
# log['request'] holds the raw request that was sent
print log['request']
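
headers_dict above mirrors the dict form shown in hackhttp's own examples; whether your installed version accepts a dict as well as a string is an assumption to verify:

# Assumption: this hackhttp build also accepts headers as a dict;
# check your installed version before relying on it.
code, head, body, redirect, log = hh.http('https://www.baidu.com', headers=headers_dict)
print log['request']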

testHackHttp5

import hackhttp
hh = hackhttp.hackhttp()
# Route the request through a local proxy, e.g. Burp Suite listening on 127.0.0.1:8080
proxy_str = ('127.0.0.1', 8080)
code, head, body, redirect, log = hh.http('http://httpbin.org/post', post="key1=val1&key2=val2", proxy=proxy_str)
# code, head, body, redirect, log = hh.http('http://www.baidu.com', proxy=proxy_str)
print code

BeautifulSoup module examples

testBeautifulSoup1

#coding=utf-8
from bs4 import BeautifulSoup
# https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# Parse the html document with bs4, returning a soup object
#soup = BeautifulSoup(html_doc)
#soup = BeautifulSoup(html_doc, "html.parser")
soup = BeautifulSoup(html_doc, "lxml")
#print(soup)
# Pretty-print the html document
# print(soup.prettify())
# A few simple ways of navigating the parsed structure
# Print the title tag
# print(soup.title)
# <title>The Dormouse's story</title>
# Print the name of the title tag
# print(soup.title.name)
# u'title'
# Print the text inside the title tag
#print(soup.title.string)
# u'The Dormouse's story'
# Print the name of the title tag's parent
#print(soup.title.parent.name)
# u'head'
# Print the first p tag and everything in it
# print(soup.p)
# <p class="title"><b>The Dormouse's story</b></p>
# Print the class attribute of the first p tag
# print(soup.p['class'])
# ['title']  (class is multi-valued, so bs4 returns a list)
# Print the first a tag
#print(soup.a)
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
# Get all a tags
#print(soup.find_all('a'))
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
# Get the tag whose id is link3
print(soup.find(id="link3"))
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
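
Note that find(id="link3") returns a single Tag (or None when nothing matches), while find_all always returns a list; continuing with the same soup:

# find returns the first match or None; find_all returns a list
tag = soup.find(id="link3")
if tag is not None:
    print(tag['href'])          # http://example.com/tillie
print(len(soup.find_all('a')))  # 3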

testBeautifulSoup2

#coding=utf-8
from bs4 import BeautifulSoup
html = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">
<span>Elsie</span>
</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html, 'lxml')
# Get the children of a tag:
# .contents puts all direct children of the p tag into a list
#print(soup.p.contents)
# Iterate and print
for c in range(len(soup.p.contents)):
    print soup.p.contents[c]
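
.contents materializes the children as a list; bs4 also exposes .children as a generator over the same nodes, which avoids the index-based loop above:

# .children yields the same direct children as .contents, lazily
for child in soup.p.children:
    print child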

testBeautifulSoup3

#coding=utf-8
from bs4 import BeautifulSoup
html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
# Find all ul tags
#print(soup.find_all('ul'))
#print(type(soup.find_all('ul')[0]))  # type of an element in the find_all result
for ul in soup.find_all('ul'):
    print(ul.find_all('li'))  # call find_all again on each result to get every li tag
'''
find_all(name, attrs, recursive, text, **kwargs)
searches the document by tag name, attributes, or text content
'''

testBeautifulSoup4

#coding=utf-8
from bs4 import BeautifulSoup
html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1" name="elements">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
print(soup.find_all(attrs={'id': 'list-1'}))       # elements whose id is list-1
#print(soup.find_all(attrs={'name': 'elements'}))  # elements whose name is elements
#print(soup.find_all(attrs={'class': 'list'}))
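
Because class is a Python keyword, bs4 also accepts the class_ keyword argument as a shortcut for attrs={'class': ...}; id can likewise be passed directly:

# Keyword-argument equivalents of the attrs forms above
print(soup.find_all(id='list-1'))
print(soup.find_all(class_='list'))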

testBeautifulSoup5

#coding=utf-8
from bs4 import BeautifulSoup
html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
print(soup.find_all(text='Foo'))  # find every text node equal to 'Foo'
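
In bs4 4.4 and later this parameter is named string; text still works as an alias:

# string= is the modern spelling of text= (bs4 >= 4.4)
print(soup.find_all(string='Foo'))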

testBeautifulSoup6

#coding=utf-8
from bs4 import BeautifulSoup
html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
#print(soup.select('.panel .panel-heading'))  # elements with class panel-heading inside class panel
#print(soup.select('ul li'))                  # li tags nested inside ul tags
#print(soup.select('#list-2 .element'))       # elements with class element inside the element with id list-2
print(type(soup.select('ul')[0]))             # type of an element in the select result
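
bs4 also provides select_one, which returns only the first element matched by a CSS selector (or None) instead of a list:

# select_one returns the first match or None
print(soup.select_one('#list-2 .element'))  # the first li in list-2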

testBeautifulSoup7

#coding=utf-8
from bs4 import BeautifulSoup
html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
# Get the li tags
for li in soup.select('li'):
    print(li.get_text())
# get_text() returns the text content of a tag

testBeautifulSoup8

#coding=utf-8
from bs4 import BeautifulSoup
html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
for ul in soup.select('ul'):
    print(ul['id'])
    print(ul.attrs['id'])
# An attribute can be read either as [name] or as attrs[name]
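
Indexing with ul['id'] raises KeyError when the attribute is missing; Tag.get works like dict.get and returns None (or a supplied default) instead:

# get() is the KeyError-safe form of attribute access
for ul in soup.select('ul'):
    print(ul.get('id'))
    print(ul.get('data-missing', 'n/a'))  # default when the attribute is absent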

re module example

#coding:utf-8
# 1. Regular expressions
#    A regular expression is a special character sequence that makes it easy to
#    check whether a string matches a given pattern.
# 2. The re module
#    compile(pattern)              build a pattern object
#    search(pattern, string)       find the pattern anywhere in the string
#    match(pattern, string)        match the pattern at the start of the string
#    findall(pattern, string)      return all matches as a list
#    sub(pattern, repl, string)    replace matches in the string
#    split(pattern, string)        split the string on the pattern
import re
# Using the six functions
# Pattern 1: a plain string pattern matching the literal 'abc'
a = re.compile('abc')
print("pattern object: " + str(a))
b = a.search('abdabcbabab')
print("search result: " + str(b))
c = a.match('abdabcbabab')
print("match at start of string: " + str(c))  # None: the string does not start with 'abc'
d = re.findall('abc', 'abdabcbabcab')
print("list of matches: " + str(d))
e = re.sub('abc', 'aaa', 'abdabcbabcab')
print("string after substitution: " + str(e))
f = re.split(',', 'ab,bc,cd,de')
print("split result: " + str(f))
# Matching flags
# re.I  case-insensitive matching
# re.L  locale-aware matching
# re.M  multi-line matching, affects ^ and $
# re.S  make . match any character, including newlines
# re.U  interpret characters according to Unicode, affects \w, \W, \b, \B
# re.X  verbose mode: allows more flexible formatting so the pattern is easier to read
# Case-insensitive
aa = re.search('abc', 'ABCc1B3ABC2', re.I)
print("case-insensitive: " + str(aa))
# ^       matches the start of the string
# $       matches the end of the string
# .       matches any character except a newline
# [...]   a character set: [amk] matches 'a', 'm' or 'k'
# [^...]  characters not in the set: [^abc] matches anything except a, b or c
# re*     matches 0 or more of the preceding expression
# re+     matches 1 or more of the preceding expression
# re?     matches 0 or 1 of the preceding expression
# re{n}   matches exactly n of the preceding expression; o{2} does not match the 'o'
#         in "Bob" but matches the two o's in "food"
# re{n,}  matches n or more; o{2,} does not match the 'o' in "Bob" but matches all
#         the o's in "foooood"; o{1,} is equivalent to o+, o{0,} to o*
# re{n,m} matches n to m of the preceding expression, greedily
# ^ matches the start of the string
bb = re.findall('^abc', 'ABCc1B3abc2')
print("match at start: " + str(bb))
# $ matches the end of the string
cc = re.findall('abc$', 'ABCc1B3abc')
print("match at end: " + str(cc))
# . matches any character except a newline
dd = re.findall('.', 'ABCc1B3abc2')
print("match any character: " + str(dd))
# A character set matching a, b or c
ee = re.findall('[abc]', 'ABCc1B3abc')
print("match a set of single characters: " + str(ee))
# Single characters not in the set
ee = re.findall('[^abc]', 'ABCc1B3abc')
print("match characters outside []: " + str(ee))
# 0 or more of the preceding character
ff = re.findall('1*', 'ABCc1B3abc')
print("match 0 or more: " + str(ff))
# 1 or more of the preceding character
ff = re.findall('1+', 'ABCc1B3abc')
print("match 1 or more: " + str(ff))
# 0 or 1 of the preceding character
ff = re.findall('1B3?', 'ABCc1B3abc')
print("match 0 or 1: " + str(ff))
# re{n}: exactly n of the preceding expression
ff = re.findall('c{2}', 'ABCcc1B3abc')
print("match exactly n: " + str(ff))
# re{n,}: n or more of the preceding expression
ff = re.findall('c{2,}', 'ABcCcc1Bccccc3abc')
print("match n or more: " + str(ff))
# re{n,m}: n to m of the preceding expression, greedy
ff = re.findall('c{2,4}', 'ABcCcc1Bccc3abc')
print("match n to m: " + str(ff))
# \d  a digit character, equivalent to [0-9]
# \D  a non-digit character, equivalent to [^0-9]
# \s  any whitespace character (space, tab, form feed, etc.), equivalent to [ \f\n\r\t\v]
# \S  any non-whitespace character, equivalent to [^ \f\n\r\t\v]
# \w  any word character, including the underscore; equivalent to [A-Za-z0-9_]
# \W  any non-word character, equivalent to [^A-Za-z0-9_]
# \d: a digit character
ff = re.findall('\d', 'ABcCcc1Bccc3abc')
print("match a digit character: " + str(ff))
# \D: a non-digit character
ff = re.findall('\D', '123c1Bc3abc')
print("match a non-digit character: " + str(ff))
# \s: any whitespace character
ff = re.findall('\s', '123 c1Bc 3abc')
print("match any whitespace character: " + str(ff))
# \S: any non-whitespace character
ff = re.findall('\S', '1 c2c 3abc')
print("match any non-whitespace character: " + str(ff))
# \w: any word character, including the underscore
ff = re.findall('\w', '1 c_A2c 3abc')
print("match any word character: " + str(ff))
# \W: any non-word character
ff = re.findall('\W', '1 c_2c 3abc')
print("match any non-word character: " + str(ff))
# Greedy vs. non-greedy matching
# Regular expressions are commonly used to find matching strings in text.
# Python's quantifiers are greedy by default: they try to match as many
# characters as possible. Non-greedy mode is the opposite: it tries to
# match as few characters as possible.
# ? makes the preceding character optional, so '1B' still matches even though no '3' follows
ff = re.findall('1B3?', 'ABCc1Bx3abc')
print("match 0 or 1: " + str(ff))
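
The comments above describe greedy vs. non-greedy quantifiers, but none of the examples contrast them directly; a short demonstration with .* versus .*?:

# coding=utf-8
import re
s = '<a>first</a><a>second</a>'
print("greedy:     " + str(re.findall('<a>.*</a>', s)))   # ['<a>first</a><a>second</a>']
print("non-greedy: " + str(re.findall('<a>.*?</a>', s)))  # ['<a>first</a>', '<a>second</a>']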