01-Baidu Tieba Crawler

package main

import (
    "fmt"
    "io"
    "net/http"
    "os"
    "strconv"
    "time"
)

func HttpGet(url string) (result string, err error) {
    resp, err1 := http.Get(url)
    if err1 != nil {
        err = err1 // pass the error inside the wrapper function out to the caller
        return
    }
    defer resp.Body.Close()

    time.Sleep(time.Second)

    // read the page data in a loop and pass it out to the caller
    buf := make([]byte, 4096)
    for {
        n, err2 := resp.Body.Read(buf)
        if n == 0 {
            fmt.Println("finished reading the page")
            break
        }
        if err2 != nil && err2 != io.EOF {
            err = err2
            return
        }
        // accumulate each iteration's buf into result and return it all at once
        result += string(buf[:n])
    }
    return
}

// crawl the pages
func working(start, end int) {
    fmt.Printf("crawling pages %d to %d....\n", start, end)
    // crawl each page in a loop
    for i := start; i <= end; i++ {
        url := "https://tieba.baidu.com/f?kw=%E7%BB%9D%E5%9C%B0%E6%B1%82%E7%94%9F&ie=utf-8&pn=" + strconv.Itoa((i-1)*50)
        result, err := HttpGet(url)
        if err != nil {
            fmt.Println("HttpGet err:", err)
            continue
        }
        //fmt.Println("result=", result)
        // save the whole page that was read into a file
        f, err := os.Create("page" + strconv.Itoa(i) + ".html")
        if err != nil {
            fmt.Println("Create err:", err)
            continue
        }
        f.WriteString(result)
        f.Close() // close each file as soon as it is saved
    }
}

func main() {
    // specify the first and last pages to crawl
    var start, end int
    fmt.Print("enter the first page to crawl (>=1): ")
    fmt.Scan(&start)
    fmt.Print("enter the last page to crawl (>=start): ")
    fmt.Scan(&end)
    working(start, end)
}
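One note on the read loop above: result += string(buf[:n]) re-copies everything read so far on every iteration. A minimal alternative sketch (not part of the original program; io.ReadAll requires Go 1.16+) that fetches a page in one call:

package main

import (
    "fmt"
    "io"
    "net/http"
)

// HttpGetAll is a sketch of HttpGet with io.ReadAll replacing the manual
// read loop and the repeated string concatenation.
func HttpGetAll(url string) (string, error) {
    resp, err := http.Get(url)
    if err != nil {
        return "", err
    }
    defer resp.Body.Close()

    data, err := io.ReadAll(resp.Body) // reads the body until EOF
    if err != nil {
        return "", err
    }
    return string(data), nil
}

func main() {
    body, err := HttpGetAll("https://tieba.baidu.com/f?kw=%E7%BB%9D%E5%9C%B0%E6%B1%82%E7%94%9F&ie=utf-8&pn=0")
    if err != nil {
        fmt.Println("HttpGetAll err:", err)
        return
    }
    fmt.Println("read", len(body), "bytes")
}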

02-Baidu Tieba Concurrent Crawler

package main

import (
    "fmt"
    "net/http"
    "io"
    "time"
    "strconv"
    "os"
)

// HttpGet2 fetches the page at url and returns its whole body via result.
func HttpGet2(url string) (result string, err error) {
    resp, err1 := http.Get(url)
    if err1 != nil {
        err = err1 // pass the error inside the wrapper function out to the caller
        return
    }
    defer resp.Body.Close()

    time.Sleep(time.Second)

    // read the page data in a loop and pass it out to the caller
    buf := make([]byte, 4096)
    for {
        n, err2 := resp.Body.Read(buf)
        if n == 0 {
            fmt.Println("读取网页完成")
            break
        }
        if err2 != nil && err2 != io.EOF {
            err = err2
            return
        }
        // accumulate each iteration's buf into result and return it all at once
        result += string(buf[:n])
    }
    return
}
// SpiderPage crawls a single page.
func SpiderPage(i int, page chan int) {
    // Notify the main goroutine even when returning early on error;
    // otherwise working2 would block forever waiting on the channel.
    defer func() { page <- i }()

    url := "https://tieba.baidu.com/f?kw=%E7%BB%9D%E5%9C%B0%E6%B1%82%E7%94%9F&ie=utf-8&pn=" + strconv.Itoa((i-1)*50)
    result, err := HttpGet2(url)
    if err != nil {
        fmt.Println("HttpGet2 err:", err)
        return
    }
    //fmt.Println("result=", result)
    // save the whole page that was read into a file
    f, err := os.Create("page" + strconv.Itoa(i) + ".html")
    if err != nil {
        fmt.Println("Create err:", err)
        return
    }
    f.WriteString(result)
    f.Close() // close each file as soon as it is saved
}

// crawl the pages
func working2(start, end int) {
    fmt.Printf("crawling pages %d to %d....\n", start, end)

    page := make(chan int)

    // crawl each page concurrently
    for i := start; i <= end; i++ {
        go SpiderPage(i, page)
    }

    for i := start; i <= end; i++ {
        fmt.Printf("page %d done\n", <-page)
    }
}

func main() {
    // specify the first and last pages to crawl
    var start, end int
    fmt.Print("enter the first page to crawl (>=1): ")
    fmt.Scan(&start)
    fmt.Print("enter the last page to crawl (>=start): ")
    fmt.Scan(&end)

    working2(start, end)
}
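The page channel in working2 both keeps main alive and reports each finished page. When only completion matters, sync.WaitGroup is the more common synchronization tool; a hypothetical sketch (workingWG and its crawl parameter are not in the original):

package main

import (
    "fmt"
    "sync"
)

// workingWG waits for all page goroutines with a WaitGroup instead of a
// channel. crawl stands in for SpiderPage's work.
func workingWG(start, end int, crawl func(i int)) {
    var wg sync.WaitGroup
    for i := start; i <= end; i++ {
        wg.Add(1)
        go func(i int) {
            defer wg.Done() // mark this page finished when crawl returns
            crawl(i)
        }(i)
    }
    wg.Wait() // block until every page goroutine has called Done
}

func main() {
    workingWG(1, 3, func(i int) { fmt.Printf("page %d done\n", i) })
}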

03-Douban Movies Concurrent Crawler

package main

import (
    "fmt"
    "strconv"
    "net/http"
    "io"
    "regexp"
    "os"
)

// HttpGetDB fetches the page at the given url and returns result.
func HttpGetDB(url string) (result string, err error) {

    resp, err1 := http.Get(url)
    if err1 != nil {
        err = err1
        return
    }
    defer resp.Body.Close()

    buf := make([]byte, 4096)
    // read the whole page in a loop
    for {
        n, err2 := resp.Body.Read(buf)
        if n == 0 {
            break
        }
        if err2 != nil && err2 != io.EOF {
            err = err2
            return
        }
        result += string(buf[:n])
    }
    return
}

func Save2file(idx int, filmName, filmScore, peopleNum [][]string) {
    path := "C:/itcast/" + "page " + strconv.Itoa(idx) + ".txt"
    f, err := os.Create(path)
    if err != nil {
        fmt.Println("os.Create err:", err)
        return
    }
    defer f.Close()

    n := len(filmName) // number of entries; should be 25
    // write the header first: movie title, score, number of raters
    f.WriteString("title" + "\t\t\t" + "score" + "\t\t" + "raters" + "\n")
    for i := 0; i < n; i++ {
        f.WriteString(filmName[i][1] + "\t\t\t" + filmScore[i][1] + "\t\t" + peopleNum[i][1] + "\n")
    }
}

// SpiderPageDB crawls one Douban page.
func SpiderPageDB(idx int, page chan int) {
    // Notify the main goroutine even when returning early on error;
    // otherwise toWork would block forever waiting on the channel.
    defer func() { page <- idx }()

    // build the url
    url := "https://movie.douban.com/top250?start=" + strconv.Itoa((idx-1)*25) + "&filter="

    // crawl the page with the wrapper HttpGetDB
    result, err := HttpGetDB(url)
    if err != nil {
        fmt.Println("HttpGetDB err:", err)
        return
    }
    //fmt.Println("result=", result)
    // compile the regular expression for the movie title:
    ret1 := regexp.MustCompile(`<img width="100" alt="(?s:(.*?))"`)
    // extract the matches
    filmName := ret1.FindAllStringSubmatch(result, -1)

    // compile the regular expression for the score:
    pattern := `<span class="rating_num" property="v:average">(?s:(.*?))</span>`
    ret2 := regexp.MustCompile(pattern)
    // extract the matches
    filmScore := ret2.FindAllStringSubmatch(result, -1)

    // compile the regular expression for the number of raters:
    ret3 := regexp.MustCompile(`<span>(?s:(\d*?))人评价</span>`)
    //ret3 := regexp.MustCompile(`<span>(.*?)人评价</span>`)

    // extract the matches
    peopleNum := ret3.FindAllStringSubmatch(result, -1)

    // write the extracted information to a file
    Save2file(idx, filmName, filmScore, peopleNum)
}

func toWork(start, end int) {
    fmt.Printf("crawling pages %d to %d...\n", start, end)

    page := make(chan int) // keeps the main goroutine from exiting early

    for i := start; i <= end; i++ {
        go SpiderPageDB(i, page)
    }

    for i := start; i <= end; i++ {
        fmt.Printf("page %d done\n", <-page)
    }
}

func main() {
    // specify the first and last pages to crawl
    var start, end int
    fmt.Print("enter the first page to crawl (>=1): ")
    fmt.Scan(&start)
    fmt.Print("enter the last page to crawl (>=start): ")
    fmt.Scan(&end)

    toWork(start, end)
}
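Both concurrent crawlers above launch one goroutine per page, so every request fires at the same instant; a site may throttle or block that. A common mitigation is a buffered channel used as a counting semaphore to cap in-flight requests. A sketch under that assumption (fetchAll and the limit of 3 are illustrative, not part of the original):

package main

import (
    "fmt"
    "time"
)

// fetchAll crawls pages start..end with at most limit requests in flight.
// fetch stands in for SpiderPageDB's HTTP work.
func fetchAll(start, end, limit int, fetch func(i int)) {
    sem := make(chan struct{}, limit) // counting semaphore with limit tokens
    done := make(chan int)

    for i := start; i <= end; i++ {
        go func(i int) {
            sem <- struct{}{} // acquire a token; blocks once limit goroutines are running
            fetch(i)
            <-sem // release the token
            done <- i
        }(i)
    }
    for i := start; i <= end; i++ {
        fmt.Printf("page %d done\n", <-done)
    }
}

func main() {
    fetchAll(1, 10, 3, func(i int) { time.Sleep(200 * time.Millisecond) })
}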

04-File Downloader (Concurrent Version)

package main

import (
    "fmt"
    "io"
    "net/http"
    "os"
    "regexp"
)

// curlwoking fetches url with a browser User-Agent and returns the page body.
func curlwoking(url string) (result string, err error) {
    client := &http.Client{}
    req, err1 := http.NewRequest("GET", url, nil)
    if err1 != nil {
        err = err1
        return
    }
    req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36")

    resp, err2 := client.Do(req)
    if err2 != nil {
        err = err2
        return
    }
    defer resp.Body.Close()

    buf := make([]byte, 4096)
    for {
        n, err3 := resp.Body.Read(buf)
        if n == 0 {
            break
        }
        if err3 != nil && err3 != io.EOF {
            err = err3
            return
        }
        result += string(buf[:n])
    }
    return
}

// GoWget downloads one file and reports the result on the channel.
func GoWget(url string, filename [][]string, page chan string, i, n int) {
    // Always report on the channel, even when returning early on error;
    // otherwise Save2file would block forever waiting for n messages.
    status := filename[i][1] + " failed"
    defer func() { page <- status }()

    url2 := url + filename[i][1]
    res, err := http.Get(url2)
    if err != nil {
        fmt.Println("http.Get err:", err)
        return
    }
    defer res.Body.Close()

    f, err := os.Create(filename[i][1])
    if err != nil {
        fmt.Println("os.Create err:", err)
        return
    }
    defer f.Close()

    io.Copy(f, res.Body)
    fmt.Printf("downloading %d of %d\n", i+1, n)
    status = filename[i][1] + " downloaded"
}

// Save2file launches one download goroutine per file and waits for all of them.
func Save2file(filename [][]string, url string) {
    page := make(chan string)
    n := len(filename)
    for i := 0; i < n; i++ {
        go GoWget(url, filename, page, i, n)
    }
    for i := 0; i < n; i++ {
        fmt.Println(<-page)
    }
}

// GowoKing scrapes the directory listing and downloads every file it links to.
func GowoKing(url string) {
    result, err := curlwoking(url)
    if err != nil {
        fmt.Println("GowoKing err:", err)
        return
    }
    ret := regexp.MustCompile(`alt="\[   \]"></td><td><a href="(.*?)"`)
    fileNames := ret.FindAllStringSubmatch(result, -1)
    Save2file(fileNames, url)
}

func main() {
    url := "http://update.cs2c.com.cn:8080/NS/V10/V10SP1/os/adv/lic/addons/x86_64/repodata/"
    //url := "https://www.baidu.com/"
    GowoKing(url)
}
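Note that http.Get reports only transport errors: a 404 comes back with err == nil, so the loop above would happily save the error page under the file's name. A sketch of the download step with a status-code check added (downloadFile and the file name in main are illustrative):

package main

import (
    "fmt"
    "io"
    "net/http"
    "os"
)

// downloadFile fetches url and streams the body to a local file,
// rejecting non-200 responses instead of saving them.
func downloadFile(url, name string) error {
    res, err := http.Get(url)
    if err != nil {
        return err
    }
    defer res.Body.Close()

    if res.StatusCode != http.StatusOK {
        return fmt.Errorf("GET %s: %s", url, res.Status)
    }

    f, err := os.Create(name)
    if err != nil {
        return err
    }
    defer f.Close()

    _, err = io.Copy(f, res.Body) // stream to disk without buffering the whole file
    return err
}

func main() {
    // hypothetical file name, for illustration only
    err := downloadFile("http://update.cs2c.com.cn:8080/NS/V10/V10SP1/os/adv/lic/addons/x86_64/repodata/example.xml", "example.xml")
    if err != nil {
        fmt.Println("download err:", err)
    }
}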

05-File Downloader (Non-Concurrent Version)

package main

import (
    "fmt"
    "io"
    "net/http"
    "os"
    "regexp"
    "time"
)

// curlwoking fetches url with a browser User-Agent and returns the page body.
func curlwoking(url string) (result string, err error) {
    client := &http.Client{}
    req, err1 := http.NewRequest("GET", url, nil)
    if err1 != nil {
        err = err1
        return
    }
    req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36")

    resp, err2 := client.Do(req)
    if err2 != nil {
        err = err2
        return
    }
    defer resp.Body.Close()

    buf := make([]byte, 4096)
    for {
        n, err3 := resp.Body.Read(buf)
        if n == 0 {
            break
        }
        if err3 != nil && err3 != io.EOF {
            err = err3
            return
        }
        result += string(buf[:n])
    }
    return
}

// GoWget downloads one file into the download directory.
func GoWget(url string, filename [][]string, i, n int) {
    url2 := url + filename[i][1]
    res, err := http.Get(url2)
    if err != nil {
        fmt.Println("http.Get err:", err)
        return
    }
    defer res.Body.Close()

    f, err := os.Create("./download/" + filename[i][1])
    if err != nil {
        fmt.Println("os.Create err:", err)
        return
    }
    defer f.Close()

    io.Copy(f, res.Body)
    fmt.Printf("%s downloaded, ", filename[i][1])
    fmt.Printf("file %d of %d\n", i+1, n)
    time.Sleep(time.Second)
}

// Save2file downloads the files one after another.
func Save2file(filename [][]string, url string) {
    n := len(filename)
    for i := 0; i < n; i++ {
        GoWget(url, filename, i, n)
    }
}

// GowoKing scrapes the directory listing and downloads every file it links to.
func GowoKing(url string) {
    result, err := curlwoking(url)
    if err != nil {
        fmt.Println("GowoKing err:", err)
        return
    }
    ret := regexp.MustCompile(`alt="\[   \]"></td><td><a href="(.*?)"`)
    fileNames := ret.FindAllStringSubmatch(result, -1)
    Save2file(fileNames, url)
}

func main() {
    var a string
    url := "http://update.cs2c.com.cn:8080/NS/V10/V10SP1/os/adv/lic/appstore/aarch64/Packages/"
    //url := "https://www.baidu.com/"
    os.Mkdir("download", 0777)
    GowoKing(url)
    fmt.Println("all files downloaded; close the window manually")
    fmt.Scan(&a) // keep the console open until the user types something
}
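os.Mkdir("download", 0777) above ignores its error, so a failure only surfaces later when os.Create fails for every file. A small sketch of the same setup with the error checked up front, using os.MkdirAll and filepath.Join (the file name is a placeholder):

package main

import (
    "fmt"
    "os"
    "path/filepath"
)

func main() {
    dir := "download"
    // MkdirAll creates missing parents and is a no-op if dir already exists
    if err := os.MkdirAll(dir, 0755); err != nil {
        fmt.Println("os.MkdirAll err:", err)
        return
    }
    // filepath.Join builds the path with the OS-appropriate separator
    path := filepath.Join(dir, "example.rpm") // placeholder file name
    fmt.Println("would save to:", path)
}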

06-Concurrent Novel Site Crawler

package main

import (
    "fmt"
    "io"
    "net/http"
    "os"
    "regexp"
    "strconv"
    "strings"
)

// HttpGet fetches the whole content of the page at url and returns it via result.
func HttpGet(url string) (result string, err error) {
    // fetch the page with a browser User-Agent
    client := &http.Client{}
    req, err1 := http.NewRequest("GET", url, nil)
    if err1 != nil {
        err = err1
        return
    }
    req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36")

    resp, err2 := client.Do(req)
    if err2 != nil {
        err = err2
        return
    }
    defer resp.Body.Close()

    buf := make([]byte, 4096)
    for {
        n, err3 := resp.Body.Read(buf)
        if n == 0 {
            break
        }
        if err3 != nil && err3 != io.EOF {
            err = err3
            return
        }
        result += string(buf[:n])
    }
    return
}

// saveJoke2File writes one book's chapter titles and contents to a text file.
func saveJoke2File(title string, filetitle, filecontent []string) {
    path := "D:/novels/" + title + ".txt"
    f, err := os.Create(path)
    if err != nil {
        fmt.Println("os.Create err:", err)
        return
    }
    defer f.Close()
    n := len(filetitle)
    for i := 0; i < n; i++ {
        f.WriteString(filetitle[i] + "\n" + filecontent[i] + "\n")
        f.WriteString("--------------------------------------\n")
        fmt.Println(filetitle[i] + " written")
    }
}

// OnePage crawls one page of the ranking list and every book on it.
func OnePage(idx int, page chan int) {
    // Notify the main goroutine even when returning early on error;
    // otherwise ToWorking would block forever waiting on the channel.
    defer func() { page <- idx }()

    // build the URL of the ranking page
    url := "http://www.zongheng.com/rank/details.html?rt=6&d=1&p=" + strconv.Itoa(idx)
    result, err := HttpGet(url)
    if err != nil {
        fmt.Println("HttpGet err:", err)
        return
    }

    // compile the regex that extracts each book's URL
    // (\r\n.* skips the line break inside the <div> block)
    str := `<div class="rank_d_book_img fl" title=".*">\r\n.*<a href="(?s:(.*?))"`
    ret := regexp.MustCompile(str)
    alls := ret.FindAllStringSubmatch(result, -1)
    for _, MatchText := range alls {
        // replace the second "book" in the URL with "showchapter"
        // to get the book's chapter-list page
        newMatchText := replaceNth(MatchText[1], "book", "showchapter", 2)
        Urls, title, err := SpiderJokerPage(newMatchText)
        if err != nil {
            fmt.Println("SpiderJokerPage err:", err)
            continue
        }
        // split the chapter URLs into a slice
        newels := strings.Split(Urls, "\n")

        // slices holding this book's chapter titles and contents
        filetitle := make([]string, 0)
        filecontent := make([]string, 0)
        for i := 0; i < len(newels); i++ {
            // crawl each chapter's title and content
            title1, content1, err := readtitleconn(newels[i])
            if err != nil {
                fmt.Println("readtitleconn err:", err)
                continue
            }
            filetitle = append(filetitle, title1)
            filecontent = append(filecontent, content1)
        }
        // write the whole book once, instead of rewriting the file per chapter
        saveJoke2File(title, filetitle, filecontent)
    }
}

// readtitleconn fetches one chapter page and extracts its title and body.
func readtitleconn(url string) (title1, content1 string, err error) {
    result, err1 := HttpGet(url)
    if err1 != nil {
        err = err1
        return
    }
    ret1 := regexp.MustCompile(`<div class="title_txtbox">(?s:(.*?))</div>`)
    alls1 := ret1.FindAllStringSubmatch(result, -1)
    for _, Temtiter := range alls1 {
        //fmt.Println(Temtiter[1])
        title1 = Temtiter[1]
    }
    ret2 := regexp.MustCompile(`<div class="content" itemprop="acticleBody">(?s:(.*?))</div>`)
    alls2 := ret2.FindAllStringSubmatch(result, -1)
    for _, Temconn := range alls2 {
        content1 = Temconn[1]
        content1 = strings.Replace(content1, "<p>", "", -1)
        content1 = strings.Replace(content1, "</p>", "", -1)
        content1 = strings.Replace(content1, " ", "", -1)

    }
    return
}

// replaceNth replaces the n-th occurrence of old in s with new.
func replaceNth(s, old, new string, n int) string {
    i := 0
    for m := 1; m <= n; m++ {
        x := strings.Index(s[i:], old)
        if x < 0 {
            break
        }
        i += x
        if m == n {
            return s[:i] + new + s[i+len(old):]
        }
        i += len(old)
    }
    return s
}

// SpiderJokerPage fetches a book's chapter-list page and returns the
// chapter URLs (newline separated) along with the book title.
func SpiderJokerPage(url string) (Urls, title string, err error) {
    result, err1 := HttpGet(url)
    if err1 != nil {
        err = err1
        return
    }
    ret := regexp.MustCompile(`<li class=" col-4">\r\n.*<a  href="(?s:(.*?))"`)
    alls := ret.FindAllStringSubmatch(result, -1)
    // collect the chapter links matched in the table of contents
    for _, Tempurl := range alls {
        Urls += Tempurl[1] + "\n"
    }

    ret2 := regexp.MustCompile(`<h1>(?s:(.*?))</h1>`)
    alls2 := ret2.FindAllStringSubmatch(result, -1)
    for _, Temtitle := range alls2 {
        title = Temtitle[1]
    }
    return
}

func ToWorking(start, end int) {
    fmt.Printf("crawling pages %d to %d...\n", start, end)

    page := make(chan int)
    // crawl the pages concurrently
    for i := start; i <= end; i++ {
        go OnePage(i, page)
    }

    for i := start; i <= end; i++ {
        fmt.Printf("page %d done\n", <-page)
    }
}

func main() {
    // read the first and last pages to crawl
    var start, end int
    fmt.Print("enter the first page to crawl (>=1): ")
    fmt.Scan(&start)
    fmt.Print("enter the last page to crawl (>=start): ")
    fmt.Scan(&end)

    ToWorking(start, end)
}
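A quick usage sketch for replaceNth, showing the transformation OnePage relies on (the URL is illustrative, patterned on the book links the crawler extracts):

package main

import (
    "fmt"
    "strings"
)

// replaceNth replaces the n-th occurrence of old in s with new
// (same function as in the crawler above).
func replaceNth(s, old, new string, n int) string {
    i := 0
    for m := 1; m <= n; m++ {
        x := strings.Index(s[i:], old)
        if x < 0 {
            break
        }
        i += x
        if m == n {
            return s[:i] + new + s[i+len(old):]
        }
        i += len(old)
    }
    return s
}

func main() {
    // the first "book" (in the host name) is kept,
    // the second (the path segment) becomes "showchapter"
    u := "http://book.zongheng.com/book/123456.html"
    fmt.Println(replaceNth(u, "book", "showchapter", 2))
    // prints: http://book.zongheng.com/showchapter/123456.html
}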