数据库设计如下

image.png
image.png

image.png

代码如下

  1. # -*- coding: utf-8 -*-
  2. import requests # 导入requests包
  3. import json
  4. import datetime
  5. import time
  6. import pymysql
  7. # 打开数据库链接
  8. db = pymysql.connect(
  9. host="localhost",
  10. port=3306,
  11. user="root",
  12. password="1234",
  13. database="database1"
  14. )
  15. # 存放微博用户数据
  16. userdata = []
  17. # 从数据库查询所有微博的账户信息
  18. def get_account_mes():
  19. # 使用cursor创建一个游标对象
  20. cursor = db.cursor()
  21. # 准备sql语句
  22. select_sql = """select * from users"""
  23. cursor.execute(select_sql)
  24. result = cursor.fetchall()
  25. for row in result:
  26. if row[1] == "微博":
  27. userdata.append(row)
  28. cursor.close()
  29. get_account_mes()
  30. # 提交数据至数据库
  31. def Submit_data_to_database(userdata, fbnr_data) -> str:
  32. try:
  33. # 提交该账户至账户数据库
  34. # 使用 cursor() 方法创建一个游标对象 cursor
  35. cursor = db.cursor()
  36. tmp = [userdata]
  37. # 准备插入用户账户数据的SQL
  38. user_sql = """INSERT INTO `weibo_account`(
  39. `Subordinate_units`, `name`, `phonenumber`, `platform`, `Account_name`, `weibohao`, `Number_of_vermicelli`,
  40. `Attention_number`, `Total_Weibo_number`, `authentication`, `brief_introduction`, `Statistical_time`)
  41. VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
  42. # 使用 execute() 方法执行 SQL 查询
  43. cursor.executemany(user_sql, tuple(tmp))
  44. # 准备插入用户发布内容的sql
  45. fbnr_data_sql = """INSERT INTO `weibo_works`(
  46. `Subordinate_units`, `name`, `phonenumber`, `platform`, `Account_name`, `weibohao`,
  47. `Weibo_content`, `Weibo_number`, `Forward_number`, `Like_number`, `Number_of_comments`,
  48. `original`, `Release_time`, `Statistical_time`)
  49. VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
  50. cursor.executemany(fbnr_data_sql, fbnr_data)
  51. # 关闭游标
  52. cursor.close()
  53. db.commit()
  54. return "提交成功"
  55. # 提交账户发布内容数据至内容数据库
  56. except Exception as err:
  57. return str(err)
  58. # 转换时间函数
  59. def trans_format(time_string, from_format, to_format='%Y.%m.%d %H:%M:%S'):
  60. """
  61. @note 时间格式转化
  62. :param time_string:
  63. :param from_format:
  64. :param to_format:
  65. :return:
  66. """
  67. time_struct = time.strptime(time_string, from_format)
  68. times = time.strftime(to_format, time_struct)
  69. return times
  70. # 构造请求头
  71. headers = {
  72. "cookie": "SUB=_2AkMVdLNgf8NxqwFRmP0cxW7jaY5-wwvEieKjKEK7JRMxHRl-yT_nqlIAtRB6PvSdgcJJunaym373dN91W1MIJArq7AzH; "
  73. "SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WhXawrTDmUF1NkVijRui0Nq; XSRF-TOKEN=etaNQZcst4rs9As_AFABQCIw; _"
  74. "s_tentry=weibo.com; Apache=5741302240361.778.1646804064448; SINAGLOBAL=5741302240361.778.1646804064448; "
  75. "ULV=1646804064507:1:1:1:5741302240361.778.1646804064448:; WBPSESS=lGn6cRy34B6AsqM-wzgd2I3xJQoNdKT6SN7Fn"
  76. "X0cs85ue7ykV44_MHoXYZ8pflevb5zjicdSLORoi8NsE9e4TLIieWK6K88wqnQQWv1OCjPWh7PkbuL85XSFk_elSupLSXOoaSpYmka7"
  77. "56SI5n5pJLbwdq81wNl0EaauOZM7zfE=",
  78. "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
  79. "Chrome/92.0.4515.107 Safari/537.36 HBPC/12.0.0.300",
  80. }
  81. for usermes in userdata:
  82. try:
  83. if usermes is None:
  84. continue
  85. if usermes[5] != "肖战工作室":
  86. continue
  87. time.sleep(1)
  88. # 准备一个数组存储当前用户的数据
  89. thisWbUserData = []
  90. # 添加序号
  91. # thisWbUserData.append(len(wbUserData)+1)
  92. # 拼接url
  93. url = 'https://weibo.com/ajax/profile/info?custom=' + usermes[6]
  94. strHtml = requests.get(url=url, headers=headers, stream=True)
  95. strHtml.encoding = 'utf-8'
  96. mes = json.loads(strHtml.text)
  97. # 添加账号名称
  98. print("正在查询" + mes["data"]["user"]["screen_name"])
  99. # 添加所属单位
  100. thisWbUserData.append(usermes[2])
  101. # 添加姓名
  102. thisWbUserData.append(usermes[3])
  103. # 添加联系电话
  104. thisWbUserData.append(usermes[4])
  105. # 添加开设平台
  106. thisWbUserData.append(usermes[1])
  107. # 添加账号名称
  108. thisWbUserData.append(mes["data"]["user"]["screen_name"])
  109. # 添加微博号
  110. thisWbUserData.append(mes["data"]["user"]["idstr"])
  111. # 添加粉丝数
  112. thisWbUserData.append(str(mes["data"]["user"]["followers_count"]))
  113. # 添加关注数
  114. thisWbUserData.append(str(mes["data"]["user"]["friends_count"]))
  115. # 添加微博数
  116. thisWbUserData.append(str(mes["data"]["user"]["statuses_count"]))
  117. # 添加认证
  118. if "verified_reason" in mes["data"]["user"]:
  119. thisWbUserData.append(mes["data"]["user"]["verified_reason"])
  120. else:
  121. thisWbUserData.append("暂无认证")
  122. # 添加简介
  123. if mes["data"]["user"]["description"] == "":
  124. thisWbUserData.append("暂无简介")
  125. else:
  126. thisWbUserData.append(mes["data"]["user"]["description"])
  127. # 添加统计时间
  128. thisWbUserData.append(str(datetime.date.today()))
  129. # 统计该用户的所有微博数据
  130. # 构建数组存放微博文章数据
  131. # 存储文章
  132. wbUserAtricData = []
  133. # 先计算该用户的微博共有几页
  134. pagenum = 0
  135. if (int(mes["data"]["user"]["statuses_count"]) - 9) % 10 == 0:
  136. pagenum = int(int(int(mes["data"]["user"]["statuses_count"]) - 9) / 10) + 1
  137. elif int(mes["data"]["user"]["statuses_count"]) <= 9:
  138. pagenum = 1
  139. else:
  140. pagenum = int(int(int(mes["data"]["user"]["statuses_count"]) - 9) / 10) + 2
  141. if pagenum >= 100:
  142. pagenum = 99
  143. num = 0
  144. isbreak = False
  145. for i in range(1, pagenum + 1):
  146. time.sleep(3)
  147. # 拼接url
  148. pageurl = "https://weibo.com/ajax/statuses/mymblog?uid=" + usermes[6] + "&page=" + str(i) + "&feature=0"
  149. pageMes = requests.get(url=pageurl, headers=headers, stream=True)
  150. time.sleep(2)
  151. pageJson = json.loads(pageMes.text)
  152. # 如果长度为0说明服务器返回数据有误
  153. if len(pageJson["data"]["list"]) == 0:
  154. isbreak = True
  155. break
  156. for i in pageJson["data"]["list"]:
  157. # 如果时间小于限定时间,则直接break
  158. if datetime.datetime.strptime(
  159. trans_format(i["created_at"], '%a %b %d %H:%M:%S +0800 %Y', '%Y-%m-%d %H:%M:%S'),
  160. '%Y-%m-%d %H:%M:%S') < datetime.datetime.strptime('2022-01-01 00:00:00', '%Y-%m-%d %H:%M:%S'):
  161. isbreak = True
  162. break
  163. # 准备一个数组存放这篇文章的数据
  164. thisArticData = []
  165. print(mes["data"]["user"]["screen_name"] + "第" + str(num) + "篇微博")
  166. num = num + 1
  167. # 添加所属单位
  168. thisArticData.append(usermes[2])
  169. # 添加姓名
  170. thisArticData.append(usermes[3])
  171. # 添加联系电话
  172. thisArticData.append(usermes[4])
  173. # 添加开设平台
  174. thisArticData.append(usermes[1])
  175. # 添加账号名
  176. thisArticData.append(mes["data"]["user"]["screen_name"])
  177. # 添加微博号
  178. thisArticData.append(mes["data"]["user"]["idstr"])
  179. # 添加微博内容
  180. if len(i["text_raw"]) > 100:
  181. thisArticData.append((i["text_raw"][:100]).replace("\u200b", "").replace("\n", "") + "...")
  182. else:
  183. thisArticData.append((i["text_raw"]).replace("\u200b", "").replace("\n", ""))
  184. # 添加微博编号
  185. thisArticData.append(i["idstr"])
  186. # 添加转发数
  187. thisArticData.append(str(i['reposts_count']))
  188. # 添加点赞数
  189. thisArticData.append(str(i['attitudes_count']))
  190. # 添加评论数
  191. thisArticData.append(str(i['comments_count']))
  192. if 'page_info' in i or 'retweeted_status' in i:
  193. thisArticData.append("转发")
  194. else:
  195. thisArticData.append("原创")
  196. # 添加发布时间
  197. tmp_time = trans_format(i["created_at"], '%a %b %d %H:%M:%S +0800 %Y', '%Y-%m-%d %H:%M:%S')
  198. # print(tmp_time)
  199. thisArticData.append(tmp_time)
  200. # 添加统计时间
  201. thisArticData.append(str(datetime.date.today()))
  202. # 合并至总数组
  203. wbUserAtricData.append(tuple(thisArticData))
  204. # 如果已经查询到日期早于限定的文章,则直接退出
  205. if isbreak:
  206. print("后续微博日期不符合要求,即将提交数据并查询下一位用户")
  207. break
  208. s = Submit_data_to_database(tuple(thisWbUserData), tuple(wbUserAtricData))
  209. print(s)
  210. except Exception as e:
  211. print(e)
  212. # 关闭数据库连接
  213. db.close()