https://www.qdnd.vn/

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2021-04-14 09:22:11
# Project: nhanda
from pyspider.libs.base_handler import *
import sqlite3, re, os
import datetime
import pymysql
import hashlib

# Image save path
website_url = 'https://www.qdnd.vn/'
DIR_PATH = "image/vietNam_www_qdnd_vn/"
# http://biengioilanhtho.gov.vn/vi/chuyen-muc/thoi-su.html
plate_list = ['chinh-tri',
              'quoc-phong-an-ninh',
              'quan-su-the-gioi',
              'thoi-su-quoc-te',
              ]
module_list = ['chinh-tri',
               'quoc-phong-an-ninh',
               'quan-su-the-gioi',
               'thoi-su-quoc-te',
               ]
conn = pymysql.connect(
    host='192.168.1.100',
    user='root',
    password='1qazXSW@',
    db='pyspider',
    charset='utf8',
)
crawl_person = 'dsw'


class Handler(BaseHandler):
    crawl_config = {
        'itag': 'v10'
    }

    def __init__(self):
        self.dir_path = DIR_PATH
        self.tool = Tool()
        self.google = Google()

    @every(minutes=24 * 60)
    def on_start(self):
        # Fetch the news sections
        for module in module_list:
            url = website_url + module
            self.crawl(url, callback=self.index_page, validate_cert=False, timeout=300)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        # Fetch the news sub-sections
        url = response.url
        print('index_page...', url)
        crawl_url = '.main-menu a[href^="' + url + '"]'
        for each in response.doc(crawl_url).items():
            print(each.attr.href)
            self.crawl(each.attr.href, callback=self.index_page1, validate_cert=False, timeout=300)

    @config(age=10 * 24 * 60 * 60)
    def index_page1(self, response):
        # Fetch the sub-section's pagination info and its article links
        url = response.url
        print(url)
        page_url_re = r'.ex_page a[title^="Trang cuối"]'
        laest_page = response.doc(page_url_re).attr.href.split('/')[-1]
        print(laest_page)
        for each in range(int(laest_page)):
            self.crawl(url + '/p1/p/' + str(each + 1), callback=self.get_detail_page, validate_cert=False, timeout=300)

    @config(priority=2)
    def get_detail_page(self, response):
        url = response.url
        fiter_url = url.split('/p1/p/')[0]
        print(fiter_url)
        for ee in response.doc('.content-list .row a[href^="' + fiter_url + '"]').items():
            detail_page_url = ee.attr.href
            print(detail_page_url)
            if len(detail_page_url.split('/p/')) == 1:
                self.crawl(detail_page_url, callback=self.detail_page, validate_cert=False, timeout=300)

    @config(priority=2)
    def detail_page(self, response):
        url = response.url
        title = response.doc('title').text()
        content = response.doc('.post-content [itemprop="articleBody"]').text()
        content_cn = self.google.translate('vi', 'zh-CN', content)
        date = response.doc('.post-subinfo').text()
        abstract = response.doc('.post-summary').text()
        crawl_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        deomo_val = url + crawl_time
        md5 = hashlib.md5(deomo_val.encode('utf8')).hexdigest()
        """
        img = response.doc('.imgtelerik')
        img_url = img.attr.src
        if img_url:
            # Get the image file extension
            extension = self.tool.get_extension(img_url)
            # Build the image file name
            file_name = extension
            self.crawl(img_url, callback=self.save_img, save={"file_name": file_name}, validate_cert=False)
            file_path = self.dir_path + file_name
        else:
            file_path = ''
        """
        c = conn.cursor()  # Get a cursor
        sql = '''
            insert into data (md5,url,title,content,content_cn,time,crawl_time,crawl_person,abstract,source)
            values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
        '''
        c.execute(sql, (
            md5, url, title, content, content_cn, date, crawl_time, crawl_person, abstract, website_url))  # Execute the SQL statement
        conn.commit()  # Commit the database operation
        c.close()
        # conn.close()  # Close the database connection
        return {
            "url": response.url,
            "title": title,
            "content": content,
            "date": date,
            "abstract": abstract,
            "content_cn": content_cn
        }

    # Save an image fetched by self.crawl
    def save_img(self, response):
        content = response.content
        file_name = response.save["file_name"]
        file_path = self.dir_path + file_name
        self.tool.save_img(content, file_path)


# Utility class
class Tool:
    def __init__(self):
        self.dir = DIR_PATH
        # Create the folder if it does not exist
        if not os.path.exists(self.dir):
            os.makedirs(self.dir)

    # Save an image to disk
    def save_img(self, content, path):
        f = open(path, "wb")
        f.write(content)
        f.close()

    # Get the file name / extension part of a URL
    def get_extension(self, url):
        extension = url.split("/")[-1]
        t = re.split(r'\?', extension)[0]
        return t


# Google translation
import logging
import urllib
import urllib.request
import urllib.parse
import requests
import execjs as execjs
import json

logger = logging.getLogger(__file__)
from requests.packages import urllib3
urllib3.disable_warnings()


class Google():
    def __init__(self):
        self.lan_dict = {
            '中文': 'zh-CN',
            '英文': 'en',
            '俄文': 'ru',
            '法文': 'fr',
            '日文': 'ja',
            '韩文': 'ko'
        }
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'}
        # self.url = 'https://translate.googleapis.com/translate_a/single'
        self.url = 'http://translate.google.cn/translate_a/single'
        self.session = requests.Session()
        self.session.keep_alive = False

    def getTk(self, text):
        return self.get_ctx().call("TL", text)

    def get_ctx(self):
        ctx = execjs.compile("""
            function TL(a) {
                var k = "";
                var b = 406644;
                var b1 = 3293161072;
                var jd = ".";
                var $b = "+-a^+6";
                var Zb = "+-3^+b+-f";
                for (var e = [], f = 0, g = 0; g < a.length; g++) {
                    var m = a.charCodeAt(g);
                    128 > m ? e[f++] = m : (2048 > m ? e[f++] = m >> 6 | 192 : (55296 == (m & 64512) && g + 1 < a.length && 56320 == (a.charCodeAt(g + 1) & 64512) ? (m = 65536 + ((m & 1023) << 10) + (a.charCodeAt(++g) & 1023),
                    e[f++] = m >> 18 | 240,
                    e[f++] = m >> 12 & 63 | 128) : e[f++] = m >> 12 | 224,
                    e[f++] = m >> 6 & 63 | 128),
                    e[f++] = m & 63 | 128)
                }
                a = b;
                for (f = 0; f < e.length; f++) a += e[f],
                a = RL(a, $b);
                a = RL(a, Zb);
                a ^= b1 || 0;
                0 > a && (a = (a & 2147483647) + 2147483648);
                a %= 1E6;
                return a.toString() + jd + (a ^ b)
            };
            function RL(a, b) {
                var t = "a";
                var Yb = "+";
                for (var c = 0; c < b.length - 2; c += 3) {
                    var d = b.charAt(c + 2),
                    d = d >= t ? d.charCodeAt(0) - 87 : Number(d),
                    d = b.charAt(c + 1) == Yb ? a >>> d : a << d;
                    a = b.charAt(c) == Yb ? a + d & 4294967295 : a ^ d
                }
                return a
            }
        """)
        return ctx

    def buildUrl(self, text, tk, sl, tl):
        baseUrl = 'http://translate.google.cn/translate_a/single'
        # baseUrl = 'https://translate.googleapis.com/translate_a/single'
        # baseUrl += '?client=webapp&'  # client=webapp translates noticeably better here; client=t is rather poor
        baseUrl += '?client=gtx&'  # client=webapp translates noticeably better here; client=t is rather poor
        baseUrl += 'sl=auto&'
        baseUrl += 'tl=' + str(tl) + '&'
        baseUrl += 'hl=zh-CN&'
        baseUrl += 'dt=at&'
        baseUrl += 'dt=bd&'
        baseUrl += 'dt=ex&'
        baseUrl += 'dt=ld&'
        baseUrl += 'dt=md&'
        baseUrl += 'dt=qca&'
        baseUrl += 'dt=rw&'
        baseUrl += 'dt=rm&'
        baseUrl += 'dt=ss&'
        baseUrl += 'dt=t&'
        baseUrl += 'ie=UTF-8&'
        baseUrl += 'oe=UTF-8&'
        baseUrl += 'clearbtn=1&'
        baseUrl += 'otf=1&'
        baseUrl += 'pc=1&'
        baseUrl += 'srcrom=0&'
        baseUrl += 'ssel=0&'
        baseUrl += 'tsel=0&'
        baseUrl += 'kc=2&'
        baseUrl += 'tk=' + str(tk) + '&'
        content = urllib.parse.quote(text)
        baseUrl += 'q=' + content
        return baseUrl

    def getHtml(self, session, url, headers):
        try:
            html = session.get(url, headers=headers)
            return html.json()
        except Exception as e:
            return None

    def translate(self, from_lan, to_lan, text):
        tk = self.getTk(text)
        url = self.buildUrl(text, tk, from_lan, to_lan)
        result = self.getHtml(self.session, url, self.headers)
        if result != None:
            ans = []
            s = ''
            for i in result[0]:
                if i[0] != None:
                    s += i[0]
            return s
        else:
            logger.info('Google translation failed')
            return None
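All three spiders write into the same `data` table of the `pyspider` database, but the table definition itself is not part of these scripts. Below is a minimal sketch of a compatible schema, created through pymysql with the same connection settings the spiders use; the column types and lengths are assumptions inferred from the values written in detail_page(), not the original DDL.

import pymysql

# Assumed schema: column names match the INSERT in detail_page(); the types
# and lengths below are guesses, not the original table definition.
DDL = """
CREATE TABLE IF NOT EXISTS data (
    md5          CHAR(32) PRIMARY KEY,   -- hash of url + crawl_time
    url          VARCHAR(512),
    title        VARCHAR(512),
    content      LONGTEXT,
    content_cn   LONGTEXT,               -- Chinese translation of content
    time         VARCHAR(64),            -- publication date as scraped from the page
    crawl_time   DATETIME,
    crawl_person VARCHAR(32),
    abstract     TEXT,
    source       VARCHAR(128)            -- the website_url constant
) DEFAULT CHARSET=utf8
"""

conn = pymysql.connect(host='192.168.1.100', user='root', password='1qazXSW@',
                       db='pyspider', charset='utf8')
with conn.cursor() as cur:
    cur.execute(DDL)
conn.commit()
conn.close()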

https://baohaiquanvietnam.vn/

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2021-04-14 09:22:11
# Project: nhanda
from pyspider.libs.base_handler import *
import sqlite3, re, os
import datetime
import pymysql
import hashlib

# Image save path
website_url = 'https://baohaiquanvietnam.vn/'
DIR_PATH = "image/vietNam_baohaiquanvietnam_vn/"
module_list = ['danh-muc?id=1',
               'danh-muc?id=6',
               'danh-muc?id=9',
               ]
conn = pymysql.connect(
    host='192.168.1.100',
    user='root',
    password='1qazXSW@',
    db='pyspider',
    charset='utf8',
)
crawl_person = 'dsw'


class Handler(BaseHandler):
    crawl_config = {
        'itag': 'v10'
    }

    def __init__(self):
        self.dir_path = DIR_PATH
        self.tool = Tool()
        self.google = Google()

    @every(minutes=24 * 60)
    def on_start(self):
        # Fetch the news sections
        for module in module_list:
            url = website_url + module
            self.crawl(url, callback=self.index_page, validate_cert=False, timeout=300)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        # Fetch the sub-section's pagination info and its article links
        url = response.url
        print(url)
        # Get the last page
        page_url_re = r'.pagination li:nth-last-child(2) a'
        laest_page = response.doc(page_url_re).text()
        print(laest_page)
        for each in range(int(laest_page)):
            self.crawl(url + '?page=' + str(each + 1), callback=self.get_detail_page, validate_cert=False, timeout=300)

    @config(priority=2)
    def get_detail_page(self, response):
        for ee in response.doc('.row .col-sm-8 a[href^="' + website_url + '"]').items():
            detail_page_url = ee.attr.href
            if len(detail_page_url.split('?page=')) == 1:
                # print('detail_page_url---', detail_page_url)
                self.crawl(detail_page_url, callback=self.detail_page, validate_cert=False, timeout=300)

    @config(priority=2)
    def detail_page(self, response):
        url = response.url
        title = response.doc('title').text()
        content = response.doc('.content_news').text()
        content_cn = self.google.translate('vi', 'zh-CN', content)
        date = response.doc('.breadcrumb li:nth-last-child(3)').text()
        abstract = response.doc('.que_news').text()
        crawl_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        deomo_val = url + crawl_time
        md5 = hashlib.md5(deomo_val.encode('utf8')).hexdigest()
        """
        img = response.doc('.imgtelerik')
        img_url = img.attr.src
        if img_url:
            # Get the image file extension
            extension = self.tool.get_extension(img_url)
            # Build the image file name
            file_name = extension
            self.crawl(img_url, callback=self.save_img, save={"file_name": file_name}, validate_cert=False)
            file_path = self.dir_path + file_name
        else:
            file_path = ''
        """
        c = conn.cursor()  # Get a cursor
        sql = '''
            insert into data (md5,url,title,content,content_cn,time,crawl_time,crawl_person,abstract,source)
            values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
        '''
        c.execute(sql, (
            md5, url, title, content, content_cn, date, crawl_time, crawl_person, abstract, website_url))  # Execute the SQL statement
        conn.commit()  # Commit the database operation
        c.close()
        # conn.close()  # Close the database connection
        return {
            "url": response.url,
            "title": title,
            "content": content,
            "date": date,
            "abstract": abstract,
            "content_cn": content_cn
        }

    # Save an image fetched by self.crawl
    def save_img(self, response):
        content = response.content
        file_name = response.save["file_name"]
        file_path = self.dir_path + file_name
        self.tool.save_img(content, file_path)


# Utility class
class Tool:
    def __init__(self):
        self.dir = DIR_PATH
        # Create the folder if it does not exist
        if not os.path.exists(self.dir):
            os.makedirs(self.dir)

    # Save an image to disk
    def save_img(self, content, path):
        f = open(path, "wb")
        f.write(content)
        f.close()

    # Get the file name / extension part of a URL
    def get_extension(self, url):
        extension = url.split("/")[-1]
        t = re.split(r'\?', extension)[0]
        return t


# Google translation
import logging
import urllib
import urllib.request
import urllib.parse
import requests
import execjs as execjs
import json

logger = logging.getLogger(__file__)
from requests.packages import urllib3
urllib3.disable_warnings()


class Google():
    def __init__(self):
        self.lan_dict = {
            '中文': 'zh-CN',
            '英文': 'en',
            '俄文': 'ru',
            '法文': 'fr',
            '日文': 'ja',
            '韩文': 'ko'
        }
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'}
        # self.url = 'https://translate.googleapis.com/translate_a/single'
        self.url = 'http://translate.google.cn/translate_a/single'
        self.session = requests.Session()
        self.session.keep_alive = False

    def getTk(self, text):
        return self.get_ctx().call("TL", text)

    def get_ctx(self):
        ctx = execjs.compile("""
            function TL(a) {
                var k = "";
                var b = 406644;
                var b1 = 3293161072;
                var jd = ".";
                var $b = "+-a^+6";
                var Zb = "+-3^+b+-f";
                for (var e = [], f = 0, g = 0; g < a.length; g++) {
                    var m = a.charCodeAt(g);
                    128 > m ? e[f++] = m : (2048 > m ? e[f++] = m >> 6 | 192 : (55296 == (m & 64512) && g + 1 < a.length && 56320 == (a.charCodeAt(g + 1) & 64512) ? (m = 65536 + ((m & 1023) << 10) + (a.charCodeAt(++g) & 1023),
                    e[f++] = m >> 18 | 240,
                    e[f++] = m >> 12 & 63 | 128) : e[f++] = m >> 12 | 224,
                    e[f++] = m >> 6 & 63 | 128),
                    e[f++] = m & 63 | 128)
                }
                a = b;
                for (f = 0; f < e.length; f++) a += e[f],
                a = RL(a, $b);
                a = RL(a, Zb);
                a ^= b1 || 0;
                0 > a && (a = (a & 2147483647) + 2147483648);
                a %= 1E6;
                return a.toString() + jd + (a ^ b)
            };
            function RL(a, b) {
                var t = "a";
                var Yb = "+";
                for (var c = 0; c < b.length - 2; c += 3) {
                    var d = b.charAt(c + 2),
                    d = d >= t ? d.charCodeAt(0) - 87 : Number(d),
                    d = b.charAt(c + 1) == Yb ? a >>> d : a << d;
                    a = b.charAt(c) == Yb ? a + d & 4294967295 : a ^ d
                }
                return a
            }
        """)
        return ctx

    def buildUrl(self, text, tk, sl, tl):
        baseUrl = 'http://translate.google.cn/translate_a/single'
        # baseUrl = 'https://translate.googleapis.com/translate_a/single'
        # baseUrl += '?client=webapp&'  # client=webapp translates noticeably better here; client=t is rather poor
        baseUrl += '?client=gtx&'  # client=webapp translates noticeably better here; client=t is rather poor
        baseUrl += 'sl=auto&'
        baseUrl += 'tl=' + str(tl) + '&'
        baseUrl += 'hl=zh-CN&'
        baseUrl += 'dt=at&'
        baseUrl += 'dt=bd&'
        baseUrl += 'dt=ex&'
        baseUrl += 'dt=ld&'
        baseUrl += 'dt=md&'
        baseUrl += 'dt=qca&'
        baseUrl += 'dt=rw&'
        baseUrl += 'dt=rm&'
        baseUrl += 'dt=ss&'
        baseUrl += 'dt=t&'
        baseUrl += 'ie=UTF-8&'
        baseUrl += 'oe=UTF-8&'
        baseUrl += 'clearbtn=1&'
        baseUrl += 'otf=1&'
        baseUrl += 'pc=1&'
        baseUrl += 'srcrom=0&'
        baseUrl += 'ssel=0&'
        baseUrl += 'tsel=0&'
        baseUrl += 'kc=2&'
        baseUrl += 'tk=' + str(tk) + '&'
        content = urllib.parse.quote(text)
        baseUrl += 'q=' + content
        return baseUrl

    def getHtml(self, session, url, headers):
        try:
            html = session.get(url, headers=headers)
            return html.json()
        except Exception as e:
            return None

    def translate(self, from_lan, to_lan, text):
        tk = self.getTk(text)
        url = self.buildUrl(text, tk, from_lan, to_lan)
        result = self.getHtml(self.session, url, self.headers)
        if result != None:
            ans = []
            s = ''
            for i in result[0]:
                if i[0] != None:
                    s += i[0]
            return s
        else:
            logger.info('Google translation failed')
            return None
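The Google helper embedded in each spider can also be exercised on its own, which is handy for checking the tk generation and the translate_a/single endpoint before wiring it into a handler. A minimal sketch under the assumption that the Google class above has been saved to a module named google_trans.py (that file name and the sample sentence are illustrative only):

# Hypothetical standalone use of the Google class defined above,
# assumed to live in google_trans.py for this example.
from google_trans import Google

g = Google()
sample = 'Việt Nam là một quốc gia ở Đông Nam Á.'  # illustrative Vietnamese input
result = g.translate('vi', 'zh-CN', sample)
if result is None:
    print('translation failed, see the logger output')
else:
    print(result)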

http://biengioilanhtho.gov.vn/

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2021-04-14 09:22:11
# Project: nhanda
from pyspider.libs.base_handler import *
import sqlite3, re, os
import datetime
import pymysql
import hashlib

# Image save path
website_url = 'http://biengioilanhtho.gov.vn/'
DIR_PATH = "image/vietNam_baohaiquanvietnam_vn/"
module_list = ['vi/chuyen-muc/thoi-su.html',
               'vi/chuyen-muc/bien-gioi-viet-nam-lao.html',
               'vi/chuyen-muc/bien-gioi-dat-lien-viet-nam-trung-quoc.html',
               'vi/chuyen-muc/bien-gioi-dat-lien-viet-nam-campuchia.html',
               'vi/chuyen-muc/bien-gioi-bien.html',
               'vi/chuyen-muc/tu-lieu-lich-su.html',
               ]
conn = pymysql.connect(
    host='192.168.1.100',
    user='root',
    password='1qazXSW@',
    db='pyspider',
    charset='utf8',
)
crawl_person = 'dsw'


class Handler(BaseHandler):
    crawl_config = {
        'itag': 'v7'
    }

    def __init__(self):
        self.dir_path = DIR_PATH
        self.tool = Tool()
        self.google = Google()

    @every(minutes=24 * 60)
    def on_start(self):
        # Fetch the news sections
        for module in module_list:
            url = website_url + module
            self.crawl(url, callback=self.index_page, validate_cert=False, timeout=300,
                       save=os.path.splitext(os.path.basename(module))[0])

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        # Fetch the sub-section's pagination info and its article links
        module = response.save
        print(module)
        url = response.url
        print(url)
        # Get the last page
        laest_page = 5000
        print(laest_page)
        page_url = 'http://biengioilanhtho.gov.vn/ajaxpro/Office.Web.Frontend.NewsListRenderBll,Office.Web.Frontend.ashx'
        for each in range(int(laest_page)):
            data = {"Lang": "vi", "Cat": module, "PageIndex": each}
            self.crawl(page_url + '?page=' + str(each), method='POST', data=json.dumps(data),
                       callback=self.get_detail_page, validate_cert=False, timeout=300,
                       headers={'Content-Type': 'application/json; charset=UTF-8',
                                'X-AjaxPro-Method': 'DrawContent'})

    @config(priority=2)
    def get_detail_page(self, response):
        response.content = response.json['value']['HtmlContent']
        # print(response.doc(' a[href^="' + website_url + '"]'))
        for ee in response.doc('a[href^="' + website_url + '"]').items():
            detail_page_url = ee.attr.href
            print('detail_page_url---', detail_page_url)
            if len(detail_page_url.split('?page=')) == 1:
                # print('detail_page_url---', detail_page_url)
                self.crawl(detail_page_url, callback=self.detail_page, validate_cert=False, timeout=300)

    @config(priority=2)
    def detail_page(self, response):
        url = response.url
        title = response.doc('h2').text()
        content = response.doc('.content_detail p').text()
        content_cn = None
        # content_cn = self.google.translate('vi', 'zh-CN', content)
        date = response.doc('.title_detail span').text()
        abstract = response.doc('.content_detail1 p:nth-child(1)').text()
        crawl_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        deomo_val = url + crawl_time
        md5 = hashlib.md5(deomo_val.encode('utf8')).hexdigest()
        """
        img = response.doc('.imgtelerik')
        img_url = img.attr.src
        if img_url:
            # Get the image file extension
            extension = self.tool.get_extension(img_url)
            # Build the image file name
            file_name = extension
            self.crawl(img_url, callback=self.save_img, save={"file_name": file_name}, validate_cert=False)
            file_path = self.dir_path + file_name
        else:
            file_path = ''
        """
        c = conn.cursor()  # Get a cursor
        sql = '''
            insert into data (md5,url,title,content,content_cn,time,crawl_time,crawl_person,abstract,source)
            values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
        '''
        c.execute(sql, (
            md5, url, title, content, content_cn, date, crawl_time, crawl_person, abstract, website_url))  # Execute the SQL statement
        conn.commit()  # Commit the database operation
        c.close()
        # conn.close()  # Close the database connection
        return {
            "url": response.url,
            "title": title,
            "content": content,
            "date": date,
            "abstract": abstract,
            "content_cn": content_cn
        }

    # Save an image fetched by self.crawl
    def save_img(self, response):
        content = response.content
        file_name = response.save["file_name"]
        file_path = self.dir_path + file_name
        self.tool.save_img(content, file_path)


# Utility class
class Tool:
    def __init__(self):
        self.dir = DIR_PATH
        # Create the folder if it does not exist
        if not os.path.exists(self.dir):
            os.makedirs(self.dir)

    # Save an image to disk
    def save_img(self, content, path):
        f = open(path, "wb")
        f.write(content)
        f.close()

    # Get the file name / extension part of a URL
    def get_extension(self, url):
        extension = url.split("/")[-1]
        t = re.split(r'\?', extension)[0]
        return t


# Google translation
import logging
import urllib
import urllib.request
import urllib.parse
import requests
import execjs as execjs
import json

logger = logging.getLogger(__file__)
from requests.packages import urllib3
urllib3.disable_warnings()


class Google():
    def __init__(self):
        self.lan_dict = {
            '中文': 'zh-CN',
            '英文': 'en',
            '俄文': 'ru',
            '法文': 'fr',
            '日文': 'ja',
            '韩文': 'ko'
        }
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'}
        # self.url = 'https://translate.googleapis.com/translate_a/single'
        self.url = 'http://translate.google.cn/translate_a/single'
        self.session = requests.Session()
        self.session.keep_alive = False

    def getTk(self, text):
        return self.get_ctx().call("TL", text)

    def get_ctx(self):
        ctx = execjs.compile("""
            function TL(a) {
                var k = "";
                var b = 406644;
                var b1 = 3293161072;
                var jd = ".";
                var $b = "+-a^+6";
                var Zb = "+-3^+b+-f";
                for (var e = [], f = 0, g = 0; g < a.length; g++) {
                    var m = a.charCodeAt(g);
                    128 > m ? e[f++] = m : (2048 > m ? e[f++] = m >> 6 | 192 : (55296 == (m & 64512) && g + 1 < a.length && 56320 == (a.charCodeAt(g + 1) & 64512) ? (m = 65536 + ((m & 1023) << 10) + (a.charCodeAt(++g) & 1023),
                    e[f++] = m >> 18 | 240,
                    e[f++] = m >> 12 & 63 | 128) : e[f++] = m >> 12 | 224,
                    e[f++] = m >> 6 & 63 | 128),
                    e[f++] = m & 63 | 128)
                }
                a = b;
                for (f = 0; f < e.length; f++) a += e[f],
                a = RL(a, $b);
                a = RL(a, Zb);
                a ^= b1 || 0;
                0 > a && (a = (a & 2147483647) + 2147483648);
                a %= 1E6;
                return a.toString() + jd + (a ^ b)
            };
            function RL(a, b) {
                var t = "a";
                var Yb = "+";
                for (var c = 0; c < b.length - 2; c += 3) {
                    var d = b.charAt(c + 2),
                    d = d >= t ? d.charCodeAt(0) - 87 : Number(d),
                    d = b.charAt(c + 1) == Yb ? a >>> d : a << d;
                    a = b.charAt(c) == Yb ? a + d & 4294967295 : a ^ d
                }
                return a
            }
        """)
        return ctx

    def buildUrl(self, text, tk, sl, tl):
        baseUrl = 'http://translate.google.cn/translate_a/single'
        # baseUrl = 'https://translate.googleapis.com/translate_a/single'
        # baseUrl += '?client=webapp&'  # client=webapp translates noticeably better here; client=t is rather poor
        baseUrl += '?client=gtx&'  # client=webapp translates noticeably better here; client=t is rather poor
        baseUrl += 'sl=auto&'
        baseUrl += 'tl=' + str(tl) + '&'
        baseUrl += 'hl=zh-CN&'
        baseUrl += 'dt=at&'
        baseUrl += 'dt=bd&'
        baseUrl += 'dt=ex&'
        baseUrl += 'dt=ld&'
        baseUrl += 'dt=md&'
        baseUrl += 'dt=qca&'
        baseUrl += 'dt=rw&'
        baseUrl += 'dt=rm&'
        baseUrl += 'dt=ss&'
        baseUrl += 'dt=t&'
        baseUrl += 'ie=UTF-8&'
        baseUrl += 'oe=UTF-8&'
        baseUrl += 'clearbtn=1&'
        baseUrl += 'otf=1&'
        baseUrl += 'pc=1&'
        baseUrl += 'srcrom=0&'
        baseUrl += 'ssel=0&'
        baseUrl += 'tsel=0&'
        baseUrl += 'kc=2&'
        baseUrl += 'tk=' + str(tk) + '&'
        content = urllib.parse.quote(text)
        baseUrl += 'q=' + content
        return baseUrl

    def getHtml(self, session, url, headers):
        try:
            html = session.get(url, headers=headers)
            return html.json()
        except Exception as e:
            return None

    def translate(self, from_lan, to_lan, text):
        tk = self.getTk(text)
        url = self.buildUrl(text, tk, from_lan, to_lan)
        result = self.getHtml(self.session, url, self.headers)
        if result != None:
            ans = []
            s = ''
            for i in result[0]:
                if i[0] != None:
                    s += i[0]
            return s
        else:
            logger.info('Google translation failed')
            return None
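Unlike the first two spiders, this one drives an AjaxPro endpoint rather than plain listing pages: index_page POSTs a JSON body with the category slug and page index, and get_detail_page reads the rendered list HTML back out of value.HtmlContent. The following standalone sketch reproduces that exchange with requests and pyquery, which can help when debugging the link selector; the response layout is assumed from the handler above, and the "thoi-su" slug is just the first entry of module_list after stripping its path and extension.

import json
import requests
from pyquery import PyQuery

# Same AjaxPro endpoint and headers the spider posts to.
page_url = ('http://biengioilanhtho.gov.vn/ajaxpro/'
            'Office.Web.Frontend.NewsListRenderBll,Office.Web.Frontend.ashx')
payload = {"Lang": "vi", "Cat": "thoi-su", "PageIndex": 0}  # first page of the "thoi-su" category

resp = requests.post(
    page_url,
    data=json.dumps(payload),
    headers={'Content-Type': 'application/json; charset=UTF-8',
             'X-AjaxPro-Method': 'DrawContent'},
    timeout=300,
)

# The spider unwraps the same field before handing it to response.doc.
html = resp.json()['value']['HtmlContent']
for a in PyQuery(html)('a[href^="http://biengioilanhtho.gov.vn/"]').items():
    print(a.attr('href'))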