三方库 - colly - 《Golang 学习笔记》

爬虫
package main
import (
    "fmt"
    "log"
    "strconv"
    "time"
    "github.com/gocolly/colly"
)
// main scrapes the issue pages http://wufazhuce.com/one/<id> for every id in
// [firstIssue, lastIssue], prints the illustration URL, quotation, volume
// label and publish date found on each page, and finally prints the total
// elapsed time.
func main() {
	const (
		baseURL    = "http://wufazhuce.com/one/"
		firstIssue = 1
		lastIssue  = 2818
	)

	start := time.Now()

	// Async mode is what makes LimitRule.Parallelism and c.Wait meaningful:
	// without it every Visit call blocks until its page has been processed,
	// so requests run strictly one at a time.
	c := colly.NewCollector(colly.Async(true))

	// Allow at most two concurrent requests to hosts matching "*wufazhuce.*"
	// and add a random delay of up to five seconds before each request.
	if err := c.Limit(&colly.LimitRule{
		DomainGlob:  "*wufazhuce.*",
		Parallelism: 2,
		RandomDelay: 5 * time.Second,
	}); err != nil {
		log.Fatalln("Invalid limit rule:", err)
	}

	// Extract the fields of interest from every div.tab-content element.
	c.OnHTML("div.tab-content", func(e *colly.HTMLElement) {
		imageURL := e.ChildAttr("img", "src")                                // illustration address
		citation := e.ChildText("div.one-cita")                              // quotation
		vol := e.ChildText("div.one-titulo")                                 // volume label
		publishDate := e.ChildText("p.dom") + " " + e.ChildText("p.may") // publish date

		// Emit all four lines with a single call so that output produced by
		// concurrently processed pages is not interleaved line by line.
		fmt.Printf(
			"imageURL: %s \ncitation: %s \nvol: %s \npublishDate: %s \n",
			imageURL, citation, vol, publishDate,
		)
	})

	// Before making a request print "Visiting ...".
	c.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL.String())
	})

	// After a page has been fully processed print "Finished ...".
	c.OnScraped(func(r *colly.Response) {
		fmt.Println("Finished", r.Request.URL)
	})

	// Report request failures instead of aborting the whole crawl.
	c.OnError(func(_ *colly.Response, err error) {
		log.Println("Something went wrong:", err)
	})

	for i := firstIssue; i <= lastIssue; i++ {
		url := baseURL + strconv.Itoa(i)
		// Visit can reject a URL before any request is made; log it and
		// keep going with the remaining issues.
		if err := c.Visit(url); err != nil {
			log.Printf("Skipping %s: %v", url, err)
		}
	}

	// Block until every queued request has finished.
	c.Wait()
	fmt.Printf("花费时间:%s", time.Since(start))
}