Author:Wei Meng-Xin (RIEM,SWUFE)
    Time:10/25/2018

    结论:

    1. 我遍历了约20%的雪球用户,检索到近9000个活跃实盘用户,预估所有实盘用户在50000个左右

    2. 样本中约9000个实盘用户,大约一半近期实盘有操作;另外一半实盘被关闭,应该可以检索到历史数据。

    3. 基于目前样本,发现1265个有明确地理位置的用户,其地区分布与雪球官方公布的地区分布基本一致。

    详细数据:

    1. 本次遍历雪球网,初步检索到 8253 个有实盘数据的用户,其中:

      1. 现在依旧活跃的实盘用户:3808

      2. 目前已经关停实盘,只保留历史调仓记录的用户:3111个

      3. 网站数据异常,需要再处理的用户:1118个

      4. 网络连接超时,需要再处理的用户:276个

    2. 3808个依旧活跃的实盘用户中,有明确地理位置的有1265个,地理位置分布如下:

    雪球网实盘用户地理位置V1 - 图1

    3. 下图是雪球官方公布的用户分布图:

    雪球网实盘用户地理位置V1 - 图2

    # Pool of raw Cookie-header strings, one single-key dict per entry.
    # NOTE(review): the first entry is keyed "cookies_are" while the other three
    # are keyed "cookie_are" -- confirm whether the difference is intentional.
    # NOTE(review): these strings embed login/session tokens (xq_a_token,
    # xq_r_token, ...); consider loading them from configuration instead of
    # keeping them in the source.
    cookie = [
        {"cookies_are": "device_id=06934df365e4a0fdf3e5c1efc4a302fd; __utmz=1.1526174181.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; s=fn15nngiri; _ga=GA1.2.191434752.1526174181; bid=ae1522508305909e11f0ccaefc21ae37_jn8z7lmc; aliyungf_tc=AQAAAEKOuhTXlwQABPNZ2lE9FlkxU221; snbim_minify=true; __utmc=1; __utma=1.191434752.1526174181.1540635588.1540638493.40; Hm_lvt_1db88642e346389874251b5a1eded6e3=1539567244,1539567517,1540450735,1540638620; _gid=GA1.2.1574455490.1540638621; xq_a_token=18b7f7dec4f54032863219716eaf839ee940199d; xqat=18b7f7dec4f54032863219716eaf839ee940199d; xq_r_token=f27bcc9f6c7b6446279ee9448db195b118b8f17c; xq_token_expire=Wed%20Nov%2021%202018%2019%3A41%3A19%20GMT%2B0800%20(CST); xq_is_login=1; u=7147604028; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1540641065; __utmb=1.37.9.1540640362715"},
        {"cookie_are": "device_id=06934df365e4a0fdf3e5c1efc4a302fd; __utmz=1.1526174181.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; s=fn15nngiri; _ga=GA1.2.191434752.1526174181; bid=ae1522508305909e11f0ccaefc21ae37_jn8z7lmc; aliyungf_tc=AQAAAEKOuhTXlwQABPNZ2lE9FlkxU221; snbim_minify=true; __utmc=1; __utma=1.191434752.1526174181.1540635588.1540638493.40; __utmt=1; Hm_lvt_1db88642e346389874251b5a1eded6e3=1539567244,1539567517,1540450735,1540638620; _gid=GA1.2.1574455490.1540638621; xq_a_token=4458f8df93a013c35835d0320917b19dcaab0a24; xqat=4458f8df93a013c35835d0320917b19dcaab0a24; xq_r_token=4812b56991883e9913998e8816706912bff911e8; xq_is_login=1; u=6146826778; xq_token_expire=Wed%20Nov%2021%202018%2019%3A17%3A51%20GMT%2B0800%20(CST); Hm_lpvt_1db88642e346389874251b5a1eded6e3=1540638965; __utmb=1.8.9.1540638934329"},
        {"cookie_are": "device_id=06934df365e4a0fdf3e5c1efc4a302fd; __utmz=1.1526174181.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; s=fn15nngiri; _ga=GA1.2.191434752.1526174181; bid=ae1522508305909e11f0ccaefc21ae37_jn8z7lmc; aliyungf_tc=AQAAAEKOuhTXlwQABPNZ2lE9FlkxU221; snbim_minify=true; __utmc=1; __utma=1.191434752.1526174181.1540635588.1540638493.40; Hm_lvt_1db88642e346389874251b5a1eded6e3=1539567244,1539567517,1540450735,1540638620; _gid=GA1.2.1574455490.1540638621; __utmt=1; xq_a_token=b2f21e25cd1817bf15c1c89cc72b25ad537495de; xqat=b2f21e25cd1817bf15c1c89cc72b25ad537495de; xq_r_token=bb8e27cca180872ab70314097a5077578ff119c8; xq_is_login=1; u=1559188240; xq_token_expire=Wed%20Nov%2021%202018%2019%3A24%3A59%20GMT%2B0800%20(CST); Hm_lpvt_1db88642e346389874251b5a1eded6e3=1540639362; __utmb=1.14.9.1540638934329"},
        {"cookie_are": "device_id=06934df365e4a0fdf3e5c1efc4a302fd; __utmz=1.1526174181.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; s=fn15nngiri; _ga=GA1.2.191434752.1526174181; bid=ae1522508305909e11f0ccaefc21ae37_jn8z7lmc; aliyungf_tc=AQAAAEKOuhTXlwQABPNZ2lE9FlkxU221; snbim_minify=true; __utmc=1; __utma=1.191434752.1526174181.1540635588.1540638493.40; Hm_lvt_1db88642e346389874251b5a1eded6e3=1539567244,1539567517,1540450735,1540638620; _gid=GA1.2.1574455490.1540638621; __utmt=1; xq_a_token=b70e7188d32f804237b6a42c052b5bcf74ebeea2; xqat=b70e7188d32f804237b6a42c052b5bcf74ebeea2; xq_r_token=b004ebba4649dfef7bba54f6ae7b703e5bca6a61; xq_token_expire=Wed%20Nov%2021%202018%2019%3A27%3A29%20GMT%2B0800%20(CST); xq_is_login=1; u=1497969916; _gat_gtag_UA_16079156_4=1; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1540639507; __utmb=1.18.9.1540639426395"},
    ]
    def get_location(num):
        """Fetch one Xueqiu user's profile fields and the linked Weibo uid.

        Two JSON endpoints on xueqiu.com are queried with a randomly chosen
        header set and cookie entry, with a random pause after each successful
        response.

        Args:
            num: Indexable row. ``num[2]`` is copied into the result unchanged;
                ``num[3]`` is the user id and is concatenated into both URLs,
                so it must be a string.

        Returns:
            list: One of three shapes (the list is also printed):

            * ``[num[2], num[3], screen_name, gender, province, city,
              followers_count, friends_count, status_count, stocks_count,
              weibo_uid]`` when both requests succeed;
            * the same first ten items followed by ``"weibo地址不存在"`` when
              the second response is not valid JSON or has no ``'id'`` entry;
            * ``[num, "异常"]`` when the first response is not valid JSON.

        Raises:
            KeyError: If the first response parses as JSON but lacks
                ``'user'`` or one of the expected user fields.
        """
        import json
        import random
        import time

        import requests

        url = u"https://xueqiu.com/statuses/original/show.json?user_id=" + num[3]
        url_1 = u"https://xueqiu.com/account/oauth/user/show.json?source=sina&userid=" + num[3]

        # (User-Agent, Accept) pairs; the remaining header fields are shared.
        accept_html = 'text/html;q=0.9,*/*;q=0.8'
        accept_json = 'application/json, text/plain, */*'
        agents = [
            ("Mozilla/5.0 (X11; CrOS x86_64 10066.0.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36", accept_html),
            ("Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1 (KHTML, like Gecko) CriOS/69.0.3497.100 Mobile/13B143 Safari/601.1.46", accept_html),
            ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A", accept_html),
            ("Mozilla/5.0 (Windows NT 10.0; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0", accept_json),
            ("Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko", accept_json),
        ]
        headers = [
            {'User-Agent': agent,
             'Accept': accept,
             'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
             'Connection': 'close'}
            for agent, accept in agents
        ]

        # NOTE(review): each entry is a single-key dict whose value is a whole
        # raw Cookie header, and the key differs between entries ("cookies_are"
        # vs "cookie_are"). requests treats dict keys as cookie names -- confirm
        # this is how the cookies are meant to be sent.
        # NOTE(review): these strings embed login/session tokens; consider
        # loading them from configuration rather than from source.
        cookie = [dict(cookies_are=u"device_id=06934df365e4a0fdf3e5c1efc4a302fd; __utmz=1.1526174181.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; s=fn15nngiri; _ga=GA1.2.191434752.1526174181; aliyungf_tc=AQAAADY46BenmA0AefNZ2iIVV7Y6rtgH; __utmc=1; xq_a_token.sig=XglA1uiAYkfyfKlhbuJdRhRTTM4; xq_r_token.sig=jW7KrLgtGYffUvfG3DfPexDR8RQ; xq_a_token=7c41909f4604aa33eb26b7c175f0468a1df2152b; xqat=7c41909f4604aa33eb26b7c175f0468a1df2152b; xq_r_token=b1914a7d50798c67bb7852f09954b82aa41a4a0b; xq_token_expire=Thu%20Nov%2008%202018%2022%3A39%3A32%20GMT%2B0800%20(CST); xq_is_login=1; u=7147604028; bid=ae1522508305909e11f0ccaefc21ae37_jn8z7lmc; snbim_minify=true; Hm_lvt_1db88642e346389874251b5a1eded6e3=1539526506,1539528869,1539567244,1539567517; _gid=GA1.2.1869629804.1540126302; __utma=1.191434752.1526174181.1540126285.1540168048.33; __utmt=1; __utmb=1.31.10.1540168048; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1540171090"),
                  dict(cookie_are=u"device_id=06934df365e4a0fdf3e5c1efc4a302fd; __utmz=1.1526174181.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; s=fn15nngiri; _ga=GA1.2.191434752.1526174181; bid=ae1522508305909e11f0ccaefc21ae37_jn8z7lmc; aliyungf_tc=AQAAAEKOuhTXlwQABPNZ2lE9FlkxU221; snbim_minify=true; __utmc=1; __utma=1.191434752.1526174181.1540635588.1540638493.40; __utmt=1; Hm_lvt_1db88642e346389874251b5a1eded6e3=1539567244,1539567517,1540450735,1540638620; _gid=GA1.2.1574455490.1540638621; xq_a_token=4458f8df93a013c35835d0320917b19dcaab0a24; xqat=4458f8df93a013c35835d0320917b19dcaab0a24; xq_r_token=4812b56991883e9913998e8816706912bff911e8; xq_is_login=1; u=6146826778; xq_token_expire=Wed%20Nov%2021%202018%2019%3A17%3A51%20GMT%2B0800%20(CST); Hm_lpvt_1db88642e346389874251b5a1eded6e3=1540638965; __utmb=1.8.9.1540638934329"),
                  dict(cookie_are="device_id=06934df365e4a0fdf3e5c1efc4a302fd; __utmz=1.1526174181.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; s=fn15nngiri; _ga=GA1.2.191434752.1526174181; bid=ae1522508305909e11f0ccaefc21ae37_jn8z7lmc; aliyungf_tc=AQAAAEKOuhTXlwQABPNZ2lE9FlkxU221; snbim_minify=true; __utmc=1; __utma=1.191434752.1526174181.1540635588.1540638493.40; Hm_lvt_1db88642e346389874251b5a1eded6e3=1539567244,1539567517,1540450735,1540638620; _gid=GA1.2.1574455490.1540638621; __utmt=1; xq_a_token=b2f21e25cd1817bf15c1c89cc72b25ad537495de; xqat=b2f21e25cd1817bf15c1c89cc72b25ad537495de; xq_r_token=bb8e27cca180872ab70314097a5077578ff119c8; xq_is_login=1; u=1559188240; xq_token_expire=Wed%20Nov%2021%202018%2019%3A24%3A59%20GMT%2B0800%20(CST); Hm_lpvt_1db88642e346389874251b5a1eded6e3=1540639362; __utmb=1.14.9.1540638934329"),
                  dict(cookie_are="device_id=06934df365e4a0fdf3e5c1efc4a302fd; __utmz=1.1526174181.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; s=fn15nngiri; _ga=GA1.2.191434752.1526174181; bid=ae1522508305909e11f0ccaefc21ae37_jn8z7lmc; aliyungf_tc=AQAAAEKOuhTXlwQABPNZ2lE9FlkxU221; snbim_minify=true; __utmc=1; __utma=1.191434752.1526174181.1540635588.1540638493.40; Hm_lvt_1db88642e346389874251b5a1eded6e3=1539567244,1539567517,1540450735,1540638620; _gid=GA1.2.1574455490.1540638621; __utmt=1; xq_a_token=b70e7188d32f804237b6a42c052b5bcf74ebeea2; xqat=b70e7188d32f804237b6a42c052b5bcf74ebeea2; xq_r_token=b004ebba4649dfef7bba54f6ae7b703e5bca6a61; xq_token_expire=Wed%20Nov%2021%202018%2019%3A27%3A29%20GMT%2B0800%20(CST); xq_is_login=1; u=1497969916; _gat_gtag_UA_16079156_4=1; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1540639507; __utmb=1.18.9.1540639426395")]

        res = []
        # The context manager closes the session on every exit path, including
        # when a KeyError from the profile lookup propagates to the caller.
        with requests.Session() as s:
            # NOTE(review): kept from the original; it is unclear whether
            # requests reads a `keep_alive` attribute on Session -- the
            # 'Connection: close' header may be what actually takes effect.
            s.keep_alive = False
            try:
                obj = s.get(url, headers=random.choice(headers), cookies=random.choice(cookie),
                            stream=True, allow_redirects=False).json()
            except json.decoder.JSONDecodeError:
                # Profile endpoint did not return JSON: record the raw row and
                # an error marker.
                res.append(num)
                res.append("异常")
            else:
                time.sleep(random.random() * 16)
                user = obj['user']
                # Build the full record before touching `res`, so a missing
                # field raises KeyError without leaving a half-filled list.
                profile = [
                    num[2],
                    num[3],
                    user['screen_name'],
                    user['gender'],
                    user['province'],
                    user['city'],
                    user['followers_count'],
                    user['friends_count'],
                    user['status_count'],
                    user['stocks_count'],
                ]
                res.extend(profile)
                try:
                    obj_1 = s.get(url_1, headers=random.choice(headers), cookies=random.choice(cookie),
                                  stream=True, allow_redirects=False).json()
                    time.sleep(random.random() * 15)
                    res.append(obj_1['id'])
                # A tuple is required here: `except A or B or C` evaluates the
                # `or` expression first and therefore only ever catches A.
                except (KeyError, json.decoder.JSONDecodeError, IndexError):
                    res.append("weibo地址不存在")
        print(res)
        return res
    # Work on every row of xueqiu_all_data except the first one.
    # NOTE(review): presumably row 0 is a header row -- confirm where
    # xueqiu_all_data is built.
    xueqiu_all = xueqiu_all_data[1:]

    if __name__ == "__main__":
        final = []
        # Sample every 40th row, beginning with the first.
        for num in xueqiu_all[::40]:
            try:
                data = get_location(num)
            except KeyError:
                # Row skipped: get_location raised KeyError for this user.
                print("KeyError")
            else:
                final.append(data)
    # Count (and print) the rows of `test`, excluding its first row, that carry
    # a usable location: either column 5 is one of the three listed cities, or
    # column 6 is not one of the placeholder values.
    # NOTE(review): the nesting of print(i) under the condition is inferred --
    # the original indentation was lost; confirm against the source notebook.
    _MUNICIPALITIES = ["北京", "上海", "天津"]
    _PLACEHOLDERS = ["", "未知", "其他", "异常", "不限", "城市/地区", None]

    t = 0
    for i in test[1:]:
        # Column 6 is only inspected when column 5 is not a listed city.
        if i[5] not in _MUNICIPALITIES and i[6] in _PLACEHOLDERS:
            continue
        t += 1
        print(i)
    print(t)