1. Introduction

Recently JK handed me another task. Yes, more web scraping... this time it is Guangming Daily (光明日报).
With this kind of site, what I fear most is heavy anti-scraping, especially IP throttling and CAPTCHA recognition... Luckily, this site has no anti-scraping measures at all: you can just fetch page after page without even pausing.

First, the first-level (index) pages for 2008–2020 start from: http://epaper.gmw.cn/

The first-level links for 1998–2007 are messier; all of them can be reached from http://epaper.gmw.cn/gmrbdb/

So for each time period we simply use a different base URL template and fill in the corresponding date and, where needed, the section (block) information, as sketched below.
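
To make that concrete, here is a small sketch of the two kinds of URL templates (both are lifted from the scraping code further down in this post; the variable names and sample dates are only for illustration):

# 2008–2020 e-paper index pages: fill in year-month, day and a two-digit section (block) number.
first_basic_url = r'https://epaper.gmw.cn/gmrb/html/{year_month}/{day}/nbs.D110000gmrb_{block}.htm'
print(first_basic_url.format(year_month='2008-01', day='01', block='01'))
# -> https://epaper.gmw.cn/gmrb/html/2008-01/01/nbs.D110000gmrb_01.htm

# 1998–2007 archive pages: fill in year, month, day and an era-specific item file name.
old_basic_url = r'http://www.gmw.cn/01gmrb/{year}-{month}/{day}/GB/{item}'
print(old_basic_url.format(year='1998', month='01', day='01', item='DEFAULT.HTM'))
# -> http://www.gmw.cn/01gmrb/1998-01/01/GB/DEFAULT.HTM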

2. Scraping the first-level pages, 2008–2020

Code:

import requests
from lxml import etree
import pandas as pd
from sqlalchemy import create_engine
import pymysql
from sqlalchemy.types import VARCHAR
pymysql.install_as_MySQLdb()


def df_to_sql(df, root, pw, db, name, idntt):
    basic_url = 'mysql+mysqldb://{root}:{pw}@localhost:3306/{db}?charset=utf8mb4'.format(root=root, pw=pw, db=db)
    engine = create_engine(basic_url)
    conn = engine.connect()
    df.to_sql(name, conn, if_exists='append',
              dtype={idntt: VARCHAR(df.index.get_level_values(idntt).str.len().max())})
    conn.close()
    pass


def get_html(url, decodes='utf-8'):
    # first fetch of the html; a CAPTCHA could show up here
    rsp = requests.get(url)
    html = rsp.content.decode(decodes, 'ignore').replace('\t', '').replace('\n', '')
    return html


def get_dom_1time(html, url, block_tick):
    # block_tick = '01', '02', etc.
    dom = etree.HTML(html)
    # links to all second-level article pages
    article_link = dom.xpath(r'.//div[@class="list_l"]//li//a//@href')
    article_link1 = [url + i for i in article_link]
    # total number of second-level articles
    art_len = len(article_link1)
    # block_tick_list = ['01', '01', '01']
    block_tick_list = [block_tick] * art_len
    # section (block) information of this edition
    # block_name_list = ['版块1:要闻', '版块2:经济', '版块3:国际'], fetched only once
    block_name_list = dom.xpath(r'.//div[@class="list_r"]//a[@href]//text()')
    block_total = len(block_name_list)
    # block_num_list = ['01', '02', '03', '04'], built only once
    block_num_list = [str(i).zfill(2) for i in range(1, block_total + 1)]
    # block_name_tick_list = ['要闻', '要闻', '要闻', '要闻']
    block_name_tick_list = [block_name_list[0]] * art_len
    return article_link1, block_tick_list, block_name_tick_list, block_num_list, block_name_list


def get_dom_2time(html, url, block_tick, block_name_tick):
    # block_tick = '01', '02', etc.; block_name_tick = '要闻', etc.
    dom = etree.HTML(html)
    article_link = dom.xpath(r'.//div[@class="list_l"]//li//a//@href')
    article_link1 = [url + i for i in article_link]
    art_len = len(article_link1)
    block_tick_list = [block_tick] * art_len
    block_name_tick_list = [block_name_tick] * art_len
    return article_link1, block_tick_list, block_name_tick_list


def get_date_list(start, end):
    date = pd.DataFrame(pd.date_range(start, end, freq='D'), columns=['date'])
    date['date'] = date['date'].apply(lambda x: str(x))
    date['year-month'] = date['date'].apply(lambda x: x[:7])
    date['day'] = date['date'].apply(lambda x: x[8:10])
    y_m = list(date['year-month'])
    d = list(date['day'])
    return y_m, d


def get_df(first_basic_url, second_basic_url, year_month, day):
    # build the first first-level page; block is always '01' here, the other blocks are filled in later
    url1 = first_basic_url.format(year_month=year_month, day=day, block='01')
    # build the prefix of the second-level pages; it does not change afterwards
    url2 = second_basic_url.format(year_month=year_month, day=day)
    # first page of the first-level index, block='01'
    html = get_html(url1)
    # block_num_list is needed for the loop below
    article_link, block_tick_list, block_name_tick_list, block_num_list, block_name_list = get_dom_1time(html, url2, '01')
    length = len(block_name_list)
    # first-level pages from page 2 onwards
    for i in range(1, length):
        block_tick = block_num_list[i]
        block_name_tick = block_name_list[i]
        url1 = first_basic_url.format(year_month=year_month, day=day, block=block_tick)
        html = get_html(url1)
        try:
            article_link1, block_tick_list1, block_name_tick_list1 = get_dom_2time(html, url2, block_tick, block_name_tick)
            article_link.extend(article_link1)
            block_tick_list.extend(block_tick_list1)
            block_name_tick_list.extend(block_name_tick_list1)
        except:
            print("Error %s " % url1)
    df = pd.DataFrame([block_tick_list, block_name_tick_list, article_link],
                      index=['block_tick', 'block_name', 'article_link']).T
    df['year_month'] = year_month
    df['day'] = day
    return df


if __name__ == "__main__":
    # build the date lists
    y_m, d = get_date_list('2008-01-01', '2020-09-29')
    # base URLs of the first-level and the second-level pages; this run is only a trial, the full range still has to be crawled completely
    first_basic_url = r'https://epaper.gmw.cn/gmrb/html/{year_month}/{day}/nbs.D110000gmrb_{block}.htm'
    second_basic_url = r'https://epaper.gmw.cn/gmrb/html/{year_month}/{day}/'
    for i in range(len(y_m)):
        year_month = y_m[i]
        day = d[i]
        print(year_month, day)
        df = get_df(first_basic_url, second_basic_url, year_month, day)
        df = df.set_index(['block_tick'], inplace=False)
        df_to_sql(df, 'root', 'pw', '光明日报', 'first_link0820', 'block_tick')
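
As a quick sanity check (my own example, not part of the script above), get_date_list simply expands a date range into parallel year-month and day lists, which are then formatted into first_basic_url and second_basic_url:

y_m, d = get_date_list('2008-01-01', '2008-01-03')
print(y_m)  # ['2008-01', '2008-01', '2008-01']
print(d)    # ['01', '02', '03']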

3. Scraping the first-level pages, 1998–2007

import requests
from lxml import etree
import pandas as pd
from sqlalchemy import create_engine
import pymysql
from sqlalchemy.types import VARCHAR
pymysql.install_as_MySQLdb()

def df_to_sql(df, root, pw, db, name, idntt):
    basic_url = 'mysql+mysqldb://{root}:{pw}@localhost:3306/{db}?charset=utf8mb4'.format(root=root, pw=pw, db=db)
    engine = create_engine(basic_url)
    conn = engine.connect()
    df.to_sql(name, conn, if_exists='append',
              dtype={idntt: VARCHAR(df.index.get_level_values(idntt).str.len().max())})
    conn.close()
    pass


def sql_to_df(one_sql, root, pw, db):
    basic_url = 'mysql+mysqldb://{root}:{pw}@localhost:3306/{db}?charset=utf8'.format(
        root=root, pw=pw, db=db)
    engine = create_engine(basic_url)
    conn = engine.connect()
    df = pd.read_sql(one_sql, conn)
    conn.close()
    return df


def get_date_list(start, end):
    date = pd.DataFrame(pd.date_range(start, end, freq='D'), columns=['date'])
    date['date'] = date['date'].apply(lambda x: str(x)[:10])
    date['year'] = date['date'].apply(lambda x: x[:4])
    date['month'] = date['date'].apply(lambda x: x[5:7])
    date['day'] = date['date'].apply(lambda x: x[8:10])
    y = list(date['year'])
    m = list(date['month'])    
    d = list(date['day'])
    dt = list(date['date'])
    return y, m, d, dt


def get_html(url, decodes='utf-8'):
    rsp = requests.get(url)
    html = rsp.content.decode(decodes, 'ignore').replace('\t', '').replace('\n', '')
    return html


def get_dom(html, basic_url, year, month, day):
    dom = etree.HTML(html)
    # blocks = dom.xpath(r'.//div[@class="channelLeftPart"]//b//text()')
    every_url = dom.xpath(r'.//div[@class="channelLeftPart"]//ul[@class="channel-newsGroup"]//li//span//a//@href')
    every_titile = dom.xpath(r'.//div[@class="channelLeftPart"]//ul[@class="channel-newsGroup"]//li//span//a//@title')
    every_url1 = [basic_url.format(year=year, month=month, day=day, item=u) for u in every_url]
    return every_titile, every_url1    


def get_df(title_list, url_list, date):
    df = pd.DataFrame([title_list, url_list], index=['title', 'url']).T
    df['date'] = date
    df = df.set_index(['date'], inplace=False)
    return df


### Links for the full years 1998–1999
basic_url1 = r'http://www.gmw.cn/01gmrb/{year}-{month}/{day}/GB/{item}'
y, m, d, dt = get_date_list('1998-01-01', '1999-12-31')
ymddt = zip(y, m, d, dt)
# ymddt = zip(y[:10], m[:10], d[:10], dt[:10])

for y1, m1, d1, dt1 in ymddt:
    print(dt1)
    url1 = basic_url1.format(year=y1, month=m1, day=d1, item='DEFAULT.HTM')
    try:
        html = get_html(url1)
        every_titile, every_url = get_dom(html, basic_url1, y1, m1, d1)
        df = get_df(every_titile, every_url, dt1)
        df_to_sql(df, 'root', 'pw', '光明日报', 'first_link9807', 'date')
    except:
        print("Error : %s"%url1)

### Full year 2000 ~ 2001.04.19
basic_url2 = r'http://www.gmw.cn/01gmrb/{year}-{month}/{day}/GB/{item}'
y, m, d, dt = get_date_list('2000-01-01', '2001-04-19')

ymddt = zip(y, m, d, dt)
# ymddt = zip(y[:10], m[:10], d[:10], dt[:10])

for y1, m1, d1, dt1 in ymddt:
    print(dt1)
    url1 = basic_url2.format(year=y1, month=m1, day=d1, item='default.htm')
    try:
        html = get_html(url1)
        every_titile, every_url = get_dom(html, basic_url2, y1, m1, d1)
        df = get_df(every_titile, every_url, dt1)
        df_to_sql(df, 'root', 'pw', '光明日报', 'first_link9807', 'date')
    except:
        print("Error : %s"%url1)


### 2001.04.20~2004.02.28
basic_url3 = r'http://www.gmw.cn/01gmrb/{year}-{month}/{day}/{item}'
item = '{year}-{month}-{day}-Homepage.htm'
y, m, d, dt = get_date_list('2001-04-20', '2004-02-28')
ymddt = zip(y, m, d, dt)
# ymddt = zip(y[:10], m[:10], d[:10], dt[:10])

for y1, m1, d1, dt1 in ymddt:
    print(dt1)
    item1 = item.format(year=y1, month=m1, day=d1)
    url1 = basic_url3.format(year=y1, month=m1, day=d1, item=item1)

    try:
        html = get_html(url1)
        every_titile, every_url = get_dom(html, basic_url3, y1, m1, d1)
        df = get_df(every_titile, every_url, dt1)
        df_to_sql(df, 'root', 'pw', '光明日报', 'first_link9807', 'date')
    except:
        print("Error : %s"%url1)



###  2004.02.29~2007.12.31
basic_url4 = r'http://www.gmw.cn/01gmrb/{year}-{month}/{day}/{item}'

y, m, d, dt = get_date_list('2004-02-29', '2007-12-31')
# ymddt = zip(y, m, d, dt)
ymddt = zip(y[:10], m[:10], d[:10], dt[:10])

for y1, m1, d1, dt1 in ymddt:
    print(dt1)
    url1 = basic_url4.format(year=y1, month=m1, day=d1, item='default.htm')
    try:
        html = get_html(url1)
        every_titile, every_url = get_dom(html, basic_url4, y1, m1, d1)
        df = get_df(every_titile, every_url, dt1)
        df_to_sql(df, 'root', 'pw', '光明日报', 'first_link9807', 'date')
    except:
        print("Error : %s"%url1)

4. Scraping the second-level article pages, 2008–2020

Next, scrape the content of the second-level article pages. Note that these requests need header information, and it differs from machine to machine.
Open one of the 2008 URLs in Google Chrome, e.g. https://epaper.gmw.cn/gmrb/html/2008-01/01/nw.D110000gmrb_20080101_1-01.htm?div=-1
then press Ctrl+Shift+I and inspect the request headers.
Then copy all of the fields from those headers and pass them to requests.get.
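
Before running the full script below, a minimal smoke test like this (my own sketch; only the user-agent is shown, the other copied fields go into the same dict) helps confirm the headers are accepted:

import requests

# Header dict copied from Chrome DevTools; only the user-agent is shown here, add the other fields the same way.
h = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'}
url = 'https://epaper.gmw.cn/gmrb/html/2008-01/01/nw.D110000gmrb_20080101_1-01.htm?div=-1'
rsp = requests.get(url, headers=h)
print(rsp.status_code)  # expect 200 if the request goes through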

from sqlalchemy import create_engine
import pymysql
import pandas as pd
import requests
from lxml import etree
import time
from sqlalchemy.types import VARCHAR
pymysql.install_as_MySQLdb()



def get_headers():
    h = {}        
    h['accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
    h['accept-encoding'] = 'gzip, deflate, br'
    h['accept-language'] = 'zh-CN,zh;q=0.9'
    h['cache-control'] = 'max-age=0'
    h['cookie'] = 'wdcid=2092f178b0f38087; _ga=GA1.2.434333475.1601462744; __auc=f9270667174de9e32c080867a0e; JSESSIONID=38DBDBA2673365EC3BBD12B822D079D0; wdses=590ee4d1d7c5eb0b; __asc=b92e75d6174f7ac22b1fcd6e6a7; _gid=GA1.2.2141714287.1601883088; wdlast=1601883189'
    h['sec-fetch-dest'] = 'document'
    h['sec-fetch-mode'] = 'navigate'
    h['sec-fetch-user'] = '?1'
    h['upgrade-insecure-requests'] = '1'
    h['user-agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
    return h

def df_to_sql(df, root, pw, db, name, idntt):
    basic_url = 'mysql+mysqldb://{root}:{pw}@localhost:3306/{db}?charset=utf8mb4'.format(root=root, pw=pw, db=db)
    engine = create_engine(basic_url)
    conn = engine.connect()
    df.to_sql(name, conn, if_exists='append',
              dtype={idntt: VARCHAR(df.index.get_level_values(idntt).str.len().max())})
    conn.close()
    pass


def sql_to_df(one_sql, root, pw, db):
    basic_url = 'mysql+mysqldb://{root}:{pw}@localhost:3306/{db}?charset=utf8'.format(
        root=root, pw=pw, db=db)
    engine = create_engine(basic_url)
    conn = engine.connect()
    df = pd.read_sql(one_sql, conn)
    conn.close()
    return df


def get_html(url, decodes='utf-8'):
    # first fetch of the html; a CAPTCHA could show up here
    rsp = requests.get(url, headers = get_headers())
    html = rsp.content.decode(decodes, 'ignore').replace('\t', '').replace('\n', '')
    return html


def get_article(html):

    dom = etree.HTML(html)
    title = dom.xpath(r'.//div[@class="text_c"]//h1//text()')
    title1 = ''
    for item in title:
        title1 += item.replace(' ', '').replace('\xa0', '').replace('\r', '')        
    article = dom.xpath(r'.//div[@id="articleContent"]//text()')
    article1 = ''
    for item in article:
        item = item.replace(' ', '').replace('\xa0', '').replace('\r', '')
        article1 += item        
    return title1, article1


def get_df(title, article, url):
    df = pd.DataFrame([url, title, article], index=['url', 'title', 'article']).T
    df = df.set_index(['url'], inplace=False)
    return df

### first_link is the table of links I just scraped and stored in MySQL.
first_link = sql_to_df("""select * from first_link0820""", 'root', 'pw', '光明日报')

for url in list(first_link['article_link']):
    try:
        html = get_html(url)
        title, article = get_article(html)
        one_link = get_df(title, article, url)
        df_to_sql(one_link, 'root', 'pw', '光明日报', 'second_page1', 'url')
    except:
        print('Error url %s'%url)

5. Scraping the second-level article pages, 1998–2007

Again, headers need to be added here.
Open one of the 1998 URLs in Google Chrome, e.g. http://www.gmw.cn/01gmrb/1998-01/01/GB/17560^GM1-0104.HTM, then press Ctrl+Shift+I and inspect the request headers.

from sqlalchemy import create_engine
import pymysql
import pandas as pd
import requests
from lxml import etree
import time
from sqlalchemy.types import VARCHAR
pymysql.install_as_MySQLdb()

def get_headers():
    h = {}       
    h['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
    h['Accept-Encoding'] = 'gzip, deflate'
    h['Accept-Language'] = 'zh-CN,zh;q=0.9'    
    h['Connection'] = 'keep-alive'
    h['Cookie'] = 'wdcid=2092f178b0f38087; _ga=GA1.2.434333475.1601462744; __auc=f9270667174de9e32c080867a0e; wdlast=1601695399; __asc=b92e75d6174f7ac22b1fcd6e6a7; _gid=GA1.2.2141714287.1601883088'
    h['Host'] = 'www.gmw.cn'    
    h['Upgrade-Insecure-Requests'] = '1'
    h['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'

    return h

def df_to_sql(df, root, pw, db, name, idntt):
    basic_url = 'mysql+mysqldb://{root}:{pw}@localhost:3306/{db}?charset=utf8mb4'.format(root=root, pw=pw, db=db)
    engine = create_engine(basic_url)
    conn = engine.connect()
    df.to_sql(name, conn, if_exists='append',
              dtype={idntt: VARCHAR(df.index.get_level_values(idntt).str.len().max())})
    conn.close()
    pass


def sql_to_df(one_sql, root, pw, db):
    basic_url = 'mysql+mysqldb://{root}:{pw}@localhost:3306/{db}?charset=utf8'.format(
        root=root, pw=pw, db=db)
    engine = create_engine(basic_url)
    conn = engine.connect()
    df = pd.read_sql(one_sql, conn)
    conn.close()
    return df


def get_html(url, decodes='utf-8'):
    # first fetch of the html; a CAPTCHA could show up here
    rsp = requests.get(url, headers = get_headers())
    html = rsp.content.decode(decodes, 'ignore').replace('\t', '').replace('\n', '')
    return html


def get_article(html):
    dom = etree.HTML(html)
    article = dom.xpath(r'.//div[@id="contentMain"]//text()')
    article1 = ''
    for item in article:
        item = item.replace(' ', '').replace('\xa0', '')
        article1 += item
    return article1


def get_df(article, url):
    df = pd.DataFrame([article, url], index=['art', 'url']).T
    df = df.set_index(['url'], inplace=False)
    return df

first_link = sql_to_df("""select * from first_link9807""", 'root', 'pw', '光明日报')

link_list = list(first_link['url'])
for url in link_list[:10]:
    try:
        html = get_html(url)
        article = get_article(html)
        one_link = get_df(article, url)
        df_to_sql(one_link, 'root', 'pw', '光明日报', 'second_page2', 'url')
    except:
        print('Error url %s'%url)
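
One caveat I have not verified: the 1998–2007 pages sit under /GB/ paths and might be GB2312/GBK-encoded rather than UTF-8, in which case decoding with 'ignore' would silently drop the Chinese text. The decodes parameter of get_html already allows switching the codec if that turns out to be the case:

# Assumption, not verified here: try GBK if the stored article text comes back empty or garbled.
html = get_html(url, decodes='gbk')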

That's all.