// Web crawler (爬虫) built with the colly scraping library:
// https://github.com/gocolly/colly
package main
import (
"fmt"
"log"
"strconv"
"time"
"github.com/gocolly/colly"
)
// main crawls the numbered pages under http://wufazhuce.com/one/ and prints,
// for each page, the illustration URL, the quotation, the volume label and the
// publish date, followed by the total elapsed time once all requests finish.
func main() {
	// Page IDs are visited in increasing order from firstPage up to, but not
	// including, endPage.
	const (
		baseURL   = "http://wufazhuce.com/one/"
		firstPage = 1
		endPage   = 2819
	)

	start := time.Now()

	// Async mode is required for the limit rule below to matter: without it
	// every Visit call blocks until its request completes, so requests never
	// overlap and the final Wait has nothing to wait for.
	c := colly.NewCollector(colly.Async(true))

	// Allow at most two concurrent requests to hosts matching the
	// "*wufazhuce.*" glob, with a random delay of up to five seconds added
	// before each request.
	if err := c.Limit(&colly.LimitRule{
		DomainGlob:  "*wufazhuce.*",
		Parallelism: 2,
		RandomDelay: 5 * time.Second,
	}); err != nil {
		log.Fatalln("Invalid limit rule:", err)
	}

	// For every div.tab-content element, extract and print the page fields.
	c.OnHTML("div.tab-content", func(e *colly.HTMLElement) {
		// Illustration URL.
		imageURL := e.ChildAttr("img", "src")
		fmt.Printf("imageURL: %s \n", imageURL)

		// Quotation.
		citation := e.ChildText("div.one-cita")
		fmt.Printf("citation: %s \n", citation)

		// Volume label.
		vol := e.ChildText("div.one-titulo")
		fmt.Printf("vol: %s \n", vol)

		// Publish date, assembled from the "p.dom" and "p.may" elements.
		publishDate := e.ChildText("p.dom") + " " + e.ChildText("p.may")
		fmt.Printf("publishDate: %s \n", publishDate)
	})

	// Before making a request print "Visiting ...".
	c.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL.String())
	})

	// After a page has been fully processed print "Finished ...".
	c.OnScraped(func(r *colly.Response) {
		fmt.Println("Finished", r.Request.URL)
	})

	// Report request failures instead of aborting the whole crawl.
	c.OnError(func(_ *colly.Response, err error) {
		log.Println("Something went wrong:", err)
	})

	for page := firstPage; page < endPage; page++ {
		pageURL := baseURL + strconv.Itoa(page)
		// Visit can reject a URL before any request is issued; log it and
		// keep going so one bad page does not stop the crawl.
		if err := c.Visit(pageURL); err != nil {
			log.Println("Visit failed:", pageURL, err)
		}
	}

	// Block until all queued asynchronous requests have completed.
	c.Wait()
	// The message below reads "time spent".
	fmt.Printf("花费时间:%s", time.Since(start))
}