https://github.com/gocolly/colly

    爬虫

    1. package main
    2. import (
    3. "fmt"
    4. "log"
    5. "strconv"
    6. "time"
    7. "github.com/gocolly/colly"
    8. )
    9. func main() {
    10. t := time.Now()
    11. c := colly.NewCollector()
    12. // Limit the number of threads started by colly to two
    13. // when visiting links which domains' matches "*httpbin.*" glob
    14. c.Limit(&colly.LimitRule{
    15. DomainGlob: "*wufazhuce.*",
    16. Parallelism: 2,
    17. RandomDelay: 5 * time.Second,
    18. })
    19. // On every a element which has href attribute call callback
    20. c.OnHTML("div.tab-content", func(e *colly.HTMLElement) {
    21. // 插画地址
    22. imageURL := e.ChildAttr("img", "src")
    23. fmt.Printf("imageURL: %s \n", imageURL)
    24. // 引言
    25. citation := e.ChildText("div.one-cita")
    26. fmt.Printf("citation: %s \n", citation)
    27. // 标号
    28. vol := e.ChildText("div.one-titulo")
    29. fmt.Printf("vol: %s \n", vol)
    30. // 发布日
    31. publishDate := e.ChildText("p.dom") + " " + e.ChildText("p.may")
    32. fmt.Printf("publishDate: %s \n", publishDate)
    33. })
    34. // Before making a request print "Visiting ..."
    35. c.OnRequest(func(r *colly.Request) {
    36. fmt.Println("Visiting", r.URL.String())
    37. })
    38. c.OnScraped(func(r *colly.Response) {
    39. fmt.Println("Finished", r.Request.URL)
    40. })
    41. c.OnError(func(_ *colly.Response, err error) {
    42. log.Println("Something went wrong:", err)
    43. })
    44. for i := 1; i < 2819; i++ {
    45. c.Visit("http://wufazhuce.com/one/" + strconv.Itoa(i))
    46. }
    47. c.Wait()
    48. fmt.Printf("花费时间:%s", time.Since(t))
    49. }