NewCollector(options ...func(*Collector)) *Collector
创建一个默认配置的新collector实例。
func NewCollector(options ...func(*Collector)) *Collector {c := &Collector{}c.Init()for _, f := range options {f(c)}c.parseSettingsFromEnv()return c}
Collector Struct
// Collector provides the scraper instance for a scraping job.
type Collector struct {
	// UserAgent is the User-Agent string used for HTTP requests.
	UserAgent string
	// MaxDepth limits the recursion depth of visited URLs.
	// Set it to 0 (the default) for infinite recursion.
	MaxDepth int
	// AllowedDomains is a domain whitelist.
	// Leave it empty to allow any domain to be visited.
	AllowedDomains []string
	// DisallowedDomains is a domain blacklist.
	DisallowedDomains []string
	// DisallowedURLFilters is a list of regular expressions which restricts
	// visiting URLs. If any of the rules matches a URL, the request is aborted.
	// DisallowedURLFilters are evaluated before URLFilters, which, when empty,
	// allows any URL to be visited.
	DisallowedURLFilters []*regexp.Regexp
	// URLFilters is a list of regular expressions which restricts visiting URLs.
	// Leave it empty to allow any URL to be visited.
	// DisallowedURLFilters are evaluated before URLFilters.
	// NOTE(review): the original comment claims a URLFilters match aborts the
	// request, which contradicts it acting as a whitelist — verify against
	// requestCheck (not visible in this chunk).
	URLFilters []*regexp.Regexp
	// AllowURLRevisit allows multiple downloads of the same URL.
	AllowURLRevisit bool
	// MaxBodySize limits the size of a response body in bytes.
	// 0 means unlimited.
	// The default value for MaxBodySize is 10MB (10 * 1024 * 1024 bytes).
	MaxBodySize int
	// CacheDir specifies a location where cached files of GET requests are stored.
	// When it is not defined, caching is disabled.
	CacheDir string
	// IgnoreRobotsTxt allows the Collector to ignore any restrictions set by the
	// host's robots.txt file. See http://www.robotstxt.org/ for more information.
	IgnoreRobotsTxt bool
	// Async turns on asynchronous network communication. Use Collector.Wait() to
	// be sure all requests have been finished.
	Async bool
	// ParseHTTPErrorResponse allows parsing HTTP responses with non-2xx status
	// codes. By default, colly parses only successful HTTP responses. Set
	// ParseHTTPErrorResponse to true to enable it.
	ParseHTTPErrorResponse bool
	// ID is the unique identifier of a collector.
	ID uint32
	// DetectCharset can enable character-encoding detection for non-UTF-8
	// response bodies without an explicit charset declaration. This feature
	// uses https://github.com/saintfish/chardet
	DetectCharset bool
	// RedirectHandler allows control over how redirects are managed.
	RedirectHandler func(req *http.Request, via []*http.Request) error
	// CheckHead performs a HEAD request before every GET to pre-validate the response.
	CheckHead bool
	// Unexported state: storage backend, debugger, per-host robots.txt cache,
	// registered callbacks, counters, HTTP backend and synchronization primitives.
	store             storage.Storage
	debugger          debug.Debugger
	robotsMap         map[string]*robotstxt.RobotsData
	htmlCallbacks     []*htmlCallbackContainer
	xmlCallbacks      []*xmlCallbackContainer
	requestCallbacks  []RequestCallback
	responseCallbacks []ResponseCallback
	errorCallbacks    []ErrorCallback
	scrapedCallbacks  []ScrapedCallback
	requestCount      uint32
	responseCount     uint32
	backend           *httpBackend
	wg                *sync.WaitGroup
	lock              *sync.RWMutex
}
func (c *Collector) Init()
初始化 Collector 私有变量、集合和Collector 默认配置。
// Init initializes the Collector's private variables and sets default
// configuration for the Collector.
func (c *Collector) Init() {
	c.UserAgent = "colly - https://github.com/gocolly/colly"
	c.MaxDepth = 0
	// Default to an in-memory visit/cookie store.
	c.store = &storage.InMemoryStorage{}
	c.store.Init()
	// Default response body limit: 10MB.
	c.MaxBodySize = 10 * 1024 * 1024
	c.backend = &httpBackend{}
	// cookiejar.New(nil) never returns a non-nil error, so it is safe to ignore.
	jar, _ := cookiejar.New(nil)
	c.backend.Init(jar)
	c.backend.Client.CheckRedirect = c.checkRedirectFunc()
	c.wg = &sync.WaitGroup{}
	c.lock = &sync.RWMutex{}
	c.robotsMap = make(map[string]*robotstxt.RobotsData)
	c.IgnoreRobotsTxt = true
	// Assign a process-wide unique collector ID atomically.
	c.ID = atomic.AddUint32(&collectorCounter, 1)
}
func (c *Collector) Visit(URL string) error
Visit通过创建对参数中指定的URL的请求来启动Collector的收集作业。Visit还调用前面提供的回调。
func (c *Collector) Visit(URL string) error {if c.CheckHead {if check := c.scrape(URL, "HEAD", 1, nil, nil, nil, true); check != nil {return check}}return c.scrape(URL, "GET", 1, nil, nil, nil, true)}
func (c *Collector) scrape(u, method string, depth int, requestData io.Reader, ctx *Context, hdr http.Header, checkRevisit bool) error
// scrape validates the target URL against the collector's rules (revisit,
// depth, domain filters, robots.txt), builds an *http.Request and hands it
// to fetch — on a new goroutine when Async is set, synchronously otherwise.
func (c *Collector) scrape(u, method string, depth int, requestData io.Reader, ctx *Context, hdr http.Header, checkRevisit bool) error {
	// Reject the request early on revisit/depth/filter violations.
	if err := c.requestCheck(u, method, depth, checkRevisit); err != nil {
		return err
	}
	parsedURL, err := url.Parse(u)
	if err != nil {
		return err
	}
	// Default to plain HTTP when no scheme was supplied.
	if parsedURL.Scheme == "" {
		parsedURL.Scheme = "http"
	}
	if !c.isDomainAllowed(parsedURL.Host) {
		return ErrForbiddenDomain
	}
	// robots.txt is only consulted for non-HEAD requests and only when the
	// collector does not ignore it.
	if method != "HEAD" && !c.IgnoreRobotsTxt {
		if err = c.checkRobots(parsedURL); err != nil {
			return err
		}
	}
	// Provide a minimal default header set when the caller supplied none.
	if hdr == nil {
		hdr = http.Header{"User-Agent": []string{c.UserAgent}}
	}
	// http.Request.Body must be an io.ReadCloser; wrap plain readers.
	rc, ok := requestData.(io.ReadCloser)
	if !ok && requestData != nil {
		rc = ioutil.NopCloser(requestData)
	}
	req := &http.Request{
		Method:     method,
		URL:        parsedURL,
		Proto:      "HTTP/1.1",
		ProtoMajor: 1,
		ProtoMinor: 1,
		Header:     hdr,
		Body:       rc,
		Host:       parsedURL.Host,
	}
	// presumably fills in ContentLength/GetBody from requestData — helper not
	// visible in this chunk, confirm against its definition.
	setRequestBody(req, requestData)
	// Use the normalized URL string from here on.
	u = parsedURL.String()
	c.wg.Add(1)
	if c.Async {
		go c.fetch(u, method, depth, requestData, ctx, hdr, req)
		return nil
	}
	return c.fetch(u, method, depth, requestData, ctx, hdr, req)
}
