01-百度贴吧爬虫
package main
import (
"fmt"
"strconv"
"net/http"
"io"
"time"
"os"
)
// HttpGet fetches url with an HTTP GET request and returns the whole response
// body as a string.
//
// It pauses for one second between receiving the response and reading the
// body, which throttles repeated calls. On a transport or read error it
// returns an empty result together with the error.
func HttpGet(url string) (result string, err error) {
	resp, err := http.Get(url)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	time.Sleep(time.Second)

	// Collect raw bytes and convert once at the end; growing a string with +=
	// copies everything read so far on every chunk.
	var body []byte
	buf := make([]byte, 4096)
	for {
		n, readErr := resp.Body.Read(buf)
		// A Read may hand back data together with an error, so keep the bytes
		// before looking at the error.
		body = append(body, buf[:n]...)
		if readErr == io.EOF {
			break
		}
		if readErr != nil {
			// A failed read must not be mistaken for the end of the page, even
			// when it delivered zero bytes.
			return "", readErr
		}
	}
	fmt.Println("读取网页完成")
	return string(body), nil
}
// working crawls list pages start through end (inclusive) one after another
// and saves each page's HTML to "第 N 页.html" in the current directory.
// A page that cannot be fetched or saved is reported on standard output and
// skipped.
func working(start, end int) {
	fmt.Printf("正在爬取第%d页到%d页....\n", start, end)
	for i := start; i <= end; i++ {
		// The pn query value is derived from the page number as (i-1)*50.
		url := "https://tieba.baidu.com/f?kw=%E7%BB%9D%E5%9C%B0%E6%B1%82%E7%94%9F&ie=utf-8&pn=" + strconv.Itoa((i-1)*50)
		result, err := HttpGet(url)
		if err != nil {
			fmt.Println("HttpGet err:", err)
			continue
		}

		f, err := os.Create("第 " + strconv.Itoa(i) + " 页" + ".html")
		if err != nil {
			fmt.Println("Create err:", err)
			continue
		}
		// Close inside the loop (not deferred) so only one file is open at a
		// time, and surface write/close failures instead of dropping them.
		_, err = f.WriteString(result)
		if cerr := f.Close(); err == nil {
			err = cerr
		}
		if err != nil {
			fmt.Println("Write err:", err)
		}
	}
}
// main reads the first and last page numbers from standard input and crawls
// that range with working. Unreadable input or a range that violates the
// prompted bounds (start >= 1, end >= start) is reported and nothing is
// crawled.
func main() {
	var start, end int
	fmt.Print("请输入爬取的起始页(>=1):")
	if _, err := fmt.Scan(&start); err != nil {
		fmt.Println("Scan err:", err)
		return
	}
	fmt.Print("请输入爬取的终止页(>=start):")
	if _, err := fmt.Scan(&end); err != nil {
		fmt.Println("Scan err:", err)
		return
	}
	// A start below 1 would produce a negative pn offset in the page URL.
	if start < 1 || end < start {
		fmt.Println("invalid page range:", start, end)
		return
	}
	working(start, end)
}
02-百度贴吧-并发爬虫
package main
import (
"fmt"
"net/http"
"io"
"time"
"strconv"
"os"
)
// HttpGet2 fetches url with an HTTP GET request and returns the whole
// response body as a string.
//
// It pauses for one second between receiving the response and reading the
// body. On a transport or read error it returns an empty result together with
// the error.
func HttpGet2(url string) (result string, err error) {
	resp, err := http.Get(url)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	time.Sleep(time.Second)

	// Accumulate bytes and convert once; repeated string += re-copies the
	// whole page for every 4 KiB chunk.
	var body []byte
	buf := make([]byte, 4096)
	for {
		n, readErr := resp.Body.Read(buf)
		// Data can arrive alongside an error, so consume it first.
		body = append(body, buf[:n]...)
		if readErr == io.EOF {
			break
		}
		if readErr != nil {
			// Report read failures even when no bytes came with them, rather
			// than treating an empty read as the end of the page.
			return "", readErr
		}
	}
	fmt.Println("读取网页完成")
	return string(body), nil
}
// SpiderPage downloads list page i, writes its HTML to "第 i 页.html" in the
// current directory, and then sends i on page.
//
// The send happens on every exit path, including failures: the goroutine that
// started the workers receives once per page, so a worker that returned
// without sending would leave it blocked forever. Failures are reported on
// standard output before the send.
func SpiderPage(i int, page chan int) {
	defer func() { page <- i }()

	// The pn query value is derived from the page number as (i-1)*50.
	url := "https://tieba.baidu.com/f?kw=%E7%BB%9D%E5%9C%B0%E6%B1%82%E7%94%9F&ie=utf-8&pn=" + strconv.Itoa((i-1)*50)
	result, err := HttpGet2(url)
	if err != nil {
		fmt.Println("HttpGet err:", err)
		return
	}

	f, err := os.Create("第 " + strconv.Itoa(i) + " 页" + ".html")
	if err != nil {
		fmt.Println("Create err:", err)
		return
	}
	// Close explicitly so the file is complete on disk before the deferred
	// send announces the page.
	_, err = f.WriteString(result)
	if cerr := f.Close(); err == nil {
		err = cerr
	}
	if err != nil {
		fmt.Println("Write err:", err)
	}
}
// working2 crawls pages start through end (inclusive) concurrently: it starts
// one SpiderPage goroutine per page and then waits for one completion value
// per started goroutine, printing each as it arrives.
func working2(start, end int) {
	fmt.Printf("正在爬取第%d页到%d页....\n", start, end)

	done := make(chan int)
	launched := 0
	for n := start; n <= end; n++ {
		go SpiderPage(n, done)
		launched++
	}

	// Receiving exactly as many values as goroutines were started keeps this
	// function from returning while workers are still running.
	for ; launched > 0; launched-- {
		finished := <-done
		fmt.Printf("第 %d 个页面爬取完成\n", finished)
	}
}
// main reads the first and last page numbers from standard input and crawls
// that range concurrently with working2. Unreadable input or a range that
// violates the prompted bounds (start >= 1, end >= start) is reported and
// nothing is crawled.
func main() {
	var start, end int
	fmt.Print("请输入爬取的起始页(>=1):")
	if _, err := fmt.Scan(&start); err != nil {
		fmt.Println("Scan err:", err)
		return
	}
	fmt.Print("请输入爬取的终止页(>=start):")
	if _, err := fmt.Scan(&end); err != nil {
		fmt.Println("Scan err:", err)
		return
	}
	// A start below 1 would produce a negative pn offset in the page URL.
	if start < 1 || end < start {
		fmt.Println("invalid page range:", start, end)
		return
	}
	working2(start, end)
}
03-豆瓣电影-爬取并发版
package main
import (
"fmt"
"strconv"
"net/http"
"io"
"regexp"
"os"
)
// HttpGetDB fetches url with an HTTP GET request and returns the whole
// response body as a string. On a transport or read error it returns an empty
// result together with the error.
func HttpGetDB(url string) (result string, err error) {
	resp, err := http.Get(url)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	// Accumulate bytes and convert once; repeated string += re-copies the
	// whole page for every 4 KiB chunk.
	var body []byte
	buf := make([]byte, 4096)
	for {
		n, readErr := resp.Body.Read(buf)
		// Data can arrive alongside an error, so consume it first.
		body = append(body, buf[:n]...)
		if readErr == io.EOF {
			break
		}
		if readErr != nil {
			// A failed read must be reported even when it delivered zero
			// bytes; treating it as end of page would return a truncated
			// document with no error.
			return "", readErr
		}
	}
	return string(body), nil
}
// Save2file writes one crawled page of results to "C:/itcast/第 <idx> 页.txt"
// as a tab-separated table: a header row followed by one row per entry.
//
// filmName, filmScore and peopleNum are regexp submatch results; element [1]
// of each entry (the first capture group) is written. Only as many rows as the
// shortest of the three slices are written, so a pattern that matched fewer
// times than the others cannot cause an out-of-range panic. Create and write
// failures are reported on standard output.
func Save2file(idx int, filmName, filmScore, peopleNum [][]string) {
	path := "C:/itcast/" + "第 " + strconv.Itoa(idx) + " 页.txt"
	f, err := os.Create(path)
	if err != nil {
		fmt.Println("os.Create err:", err)
		return
	}
	defer f.Close()

	rows := len(filmName)
	if len(filmScore) < rows {
		rows = len(filmScore)
	}
	if len(peopleNum) < rows {
		rows = len(peopleNum)
	}

	// Header: film title, rating, number of ratings.
	if _, err := f.WriteString("电影名称" + "\t\t\t" + "评分" + "\t\t" + "评分人数" + "\n"); err != nil {
		fmt.Println("WriteString err:", err)
		return
	}
	for i := 0; i < rows; i++ {
		line := filmName[i][1] + "\t\t\t" + filmScore[i][1] + "\t\t" + peopleNum[i][1] + "\n"
		if _, err := f.WriteString(line); err != nil {
			fmt.Println("WriteString err:", err)
			return
		}
	}
}
// Patterns for the three fields pulled out of each list page. They are
// compiled once at package initialisation instead of once per crawled page.
var (
	// filmNameRe captures the alt text of each poster image (the film title).
	filmNameRe = regexp.MustCompile(`<img width="100" alt="(?s:(.*?))"`)
	// filmScoreRe captures the text of each rating_num span.
	filmScoreRe = regexp.MustCompile(`<span class="rating_num" property="v:average">(?s:(.*?))</span>`)
	// peopleNumRe captures the digits in front of "人评价".
	peopleNumRe = regexp.MustCompile(`<span>(?s:(\d*?))人评价</span>`)
)

// SpiderPageDB crawls list page idx, extracts film titles, ratings and rating
// counts with the patterns above, passes them to Save2file, and then sends
// idx on page.
//
// The send happens on every exit path, including a failed fetch: the
// goroutine that started the workers receives once per page, so a worker that
// returned without sending would leave it blocked forever.
func SpiderPageDB(idx int, page chan int) {
	defer func() { page <- idx }()

	// The start query value is derived from the page number as (idx-1)*25.
	url := "https://movie.douban.com/top250?start=" + strconv.Itoa((idx-1)*25) + "&filter="
	result, err := HttpGetDB(url)
	if err != nil {
		fmt.Println("HttpGetDB err:", err)
		return
	}

	filmName := filmNameRe.FindAllStringSubmatch(result, -1)
	filmScore := filmScoreRe.FindAllStringSubmatch(result, -1)
	peopleNum := peopleNumRe.FindAllStringSubmatch(result, -1)

	Save2file(idx, filmName, filmScore, peopleNum)
}
// toWork crawls pages start through end (inclusive) concurrently. It starts
// one SpiderPageDB goroutine per page and then blocks until it has received
// one completion value for each of them, printing every page number as it
// arrives.
func toWork(start, end int) {
	fmt.Printf("正在爬取 %d 到 %d 页...\n", start, end)

	// The channel doubles as a barrier that keeps this function (and so the
	// main goroutine) from returning before the workers finish.
	finished := make(chan int)
	pending := 0
	for idx := start; idx <= end; idx++ {
		go SpiderPageDB(idx, finished)
		pending++
	}

	for pending > 0 {
		fmt.Printf("第 %d 页爬取完毕\n", <-finished)
		pending--
	}
}
// main reads the first and last page numbers from standard input and crawls
// that range with toWork. Unreadable input or a range that violates the
// prompted bounds (start >= 1, end >= start) is reported and nothing is
// crawled.
func main() {
	var start, end int
	fmt.Print("请输入爬取的起始页(>=1):")
	if _, err := fmt.Scan(&start); err != nil {
		fmt.Println("Scan err:", err)
		return
	}
	fmt.Print("请输入爬取的终止页(>=start):")
	if _, err := fmt.Scan(&end); err != nil {
		fmt.Println("Scan err:", err)
		return
	}
	// A start below 1 would produce a negative start offset in the page URL.
	if start < 1 || end < start {
		fmt.Println("invalid page range:", start, end)
		return
	}
	toWork(start, end)
}
04-自己写的下载文件(并发版)
package main
import (
"fmt"
"io"
"net/http"
"os"
"regexp"
)
// curlwoking fetches url with an HTTP GET request that carries a desktop
// Chrome User-Agent header and returns the whole response body as a string.
// On a request, transport or read error it returns an empty result together
// with the error.
func curlwoking(url string) (result string, err error) {
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return "", err
	}
	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36")

	// The shared default client has the same zero-value configuration as a
	// freshly allocated one, without building a new client per call.
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	// Accumulate bytes and convert once; repeated string += re-copies the
	// whole page for every 4 KiB chunk.
	var body []byte
	buf := make([]byte, 4096)
	for {
		n, readErr := resp.Body.Read(buf)
		// Data can arrive alongside an error, so consume it first.
		body = append(body, buf[:n]...)
		if readErr == io.EOF {
			break
		}
		if readErr != nil {
			// Report read failures even when they delivered zero bytes,
			// instead of returning a silently truncated page.
			return "", readErr
		}
	}
	return string(body), nil
}
// GoWget downloads url + filename[i][1] into a file named filename[i][1] in
// the current directory and then sends a status line on page.
//
// filename holds regexp submatch results; element [1] of entry i is the link
// target. i and n are only used for the progress message. Exactly one message
// is sent on page on every exit path ("...下载完毕" on success, "...下载失败"
// on failure), because the caller receives once per started download and
// would otherwise block forever.
//
// NOTE(review): the file name comes straight from the remote listing and is
// used as a local path without sanitising — confirm the listing is trusted.
func GoWget(url string, filename [][]string, page chan string, i, n int) {
	name := filename[i][1]
	status := name + "下载失败"
	defer func() { page <- status }()

	res, err := http.Get(url + name)
	if err != nil {
		fmt.Println("http.Get err:", err)
		return
	}
	defer res.Body.Close()
	// Without this check an error page would be saved as if it were the file.
	if res.StatusCode != http.StatusOK {
		fmt.Println("http.Get err:", res.Status)
		return
	}

	f, err := os.Create(name)
	if err != nil {
		fmt.Println("os.Create err:", err)
		return
	}
	_, copyErr := io.Copy(f, res.Body)
	closeErr := f.Close()
	if copyErr != nil {
		fmt.Println("io.Copy err:", copyErr)
		return
	}
	if closeErr != nil {
		fmt.Println("Close err:", closeErr)
		return
	}

	fmt.Printf("正在下载第%d,共%d个", i, n)
	status = name + "下载完毕"
}
// Save2file downloads every entry of filename concurrently. It starts one
// GoWget goroutine per entry, all reporting on a shared channel, and prints
// one received status line per entry before returning.
func Save2file(filename [][]string, url string) {
	total := len(filename)
	results := make(chan string)

	for idx := range filename {
		go GoWget(url, filename, results, idx, total)
	}

	// One receive per started download keeps this function from returning
	// while downloads are still in flight.
	for pending := total; pending > 0; pending-- {
		line := <-results
		fmt.Println(line)
	}
}
//for i := 0; i < n; i++ {
// fmt.Println(<-page)
//}
// GowoKing fetches the listing page at url, extracts every link target that
// follows an alt="[ ]" icon cell, and passes the matches together with url to
// Save2file. A fetch failure is printed and ends the function.
func GowoKing(url string) {
	listing, fetchErr := curlwoking(url)
	if fetchErr != nil {
		fmt.Println("GowoKing err:", fetchErr)
		return
	}

	// Capture group 1 is the href value of each matched entry.
	const linkPattern = `alt="\[ \]"></td><td><a href="(.*?)"`
	links := regexp.MustCompile(linkPattern).FindAllStringSubmatch(listing, -1)

	Save2file(links, url)
}
// main downloads every file linked from the fixed repository listing below.
func main() {
	// A commented-out alternative target in the original was
	// https://www.baidu.com/.
	const repoURL = "http://update.cs2c.com.cn:8080/NS/V10/V10SP1/os/adv/lic/addons/x86_64/repodata/"
	GowoKing(repoURL)
}
05-自己写的下载文件(非并发版)
package main
import (
"fmt"
"io"
"net/http"
"os"
"regexp"
"time"
)
// curlwoking fetches url with an HTTP GET request that carries a desktop
// Chrome User-Agent header and returns the whole response body as a string.
// On a request, transport or read error it returns an empty result together
// with the error.
func curlwoking(url string) (result string, err error) {
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return "", err
	}
	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36")

	// The shared default client has the same zero-value configuration as a
	// freshly allocated one, without building a new client per call.
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	// Gather the body as bytes; one conversion at the end replaces a string
	// copy per chunk.
	var body []byte
	buf := make([]byte, 4096)
	for {
		n, readErr := resp.Body.Read(buf)
		// Keep whatever was read before inspecting the error: a Read may
		// return both.
		body = append(body, buf[:n]...)
		if readErr == io.EOF {
			break
		}
		if readErr != nil {
			// An empty read with an error is a failure, not the end of the
			// page.
			return "", readErr
		}
	}
	return string(body), nil
}
// GoWget downloads url + filename[i][1] into "./download/<name>", prints a
// progress line, and then sleeps for one second to pace successive downloads.
//
// filename holds regexp submatch results; element [1] of entry i is the link
// target. i and n are only used for the progress message. Failures are
// reported on standard output and return immediately, without the pause.
//
// NOTE(review): the file name comes straight from the remote listing and is
// used as a local path without sanitising — confirm the listing is trusted.
func GoWget(url string, filename [][]string, i, n int) {
	name := filename[i][1]
	res, err := http.Get(url + name)
	if err != nil {
		fmt.Println("http.Get err:", err)
		return
	}
	defer res.Body.Close()
	// Without this check an error page would be saved as if it were the file.
	if res.StatusCode != http.StatusOK {
		fmt.Println("http.Get err:", res.Status)
		return
	}

	path := "./download/" + name
	f, err := os.Create(path)
	if err != nil {
		fmt.Println("os.Create err:", err)
		return
	}
	_, copyErr := io.Copy(f, res.Body)
	closeErr := f.Close()
	if copyErr != nil || closeErr != nil {
		if copyErr != nil {
			fmt.Println("io.Copy err:", copyErr)
		} else {
			fmt.Println("Close err:", closeErr)
		}
		// Best effort: do not leave a truncated file that looks complete.
		os.Remove(path)
		return
	}

	// Print, not Printf: the name is data and may contain '%'.
	fmt.Print(name + " 下载完毕,")
	fmt.Printf("正在下载第%d个,共%d个\n", i+1, n)
	time.Sleep(time.Second)
}
// Save2file downloads every entry of filename one after another by calling
// GoWget for each index, passing the total count for progress reporting.
func Save2file(filename [][]string, url string) {
	total := len(filename)
	for idx := range filename {
		GoWget(url, filename, idx, total)
	}
}
//for i := 0; i < n; i++ {
// fmt.Println(<-page)
//}
// GowoKing fetches the listing page at url, extracts every link target that
// follows an alt="[ ]" icon cell, and passes the matches together with url to
// Save2file. A fetch failure is printed and ends the function.
func GowoKing(url string) {
	page, fetchErr := curlwoking(url)
	if fetchErr != nil {
		fmt.Println("GowoKing err:", fetchErr)
		return
	}

	// Capture group 1 is the href value of each matched entry.
	entryRe := regexp.MustCompile(`alt="\[ \]"></td><td><a href="(.*?)"`)
	Save2file(entryRe.FindAllStringSubmatch(page, -1), url)
}
// main creates the "download" directory, downloads every file linked from the
// fixed package listing below into it, and then waits for a line of input so
// a console window stays open until the user dismisses it.
func main() {
	var a string
	url := "http://update.cs2c.com.cn:8080/NS/V10/V10SP1/os/adv/lic/appstore/aarch64/Packages/"

	// An already existing directory is fine; any other failure means no
	// download could be saved, so report it instead of attempting them all.
	if err := os.Mkdir("download", 0777); err != nil && !os.IsExist(err) {
		fmt.Println("os.Mkdir err:", err)
	} else {
		GowoKing(url)
		fmt.Println("文件已全部下载完成,请手动关闭")
	}

	// The value and error are deliberately ignored: this read only pauses the
	// program before exit.
	fmt.Scan(&a)
}
06-爬取并发小说网站
package main
import (
"fmt"
"io"
"net/http"
"os"
"regexp"
"strconv"
"strings"
)
// HttpGet fetches url with an HTTP GET request that carries a desktop Chrome
// User-Agent header and returns the whole response body as a string. On a
// request, transport or read error it returns an empty result together with
// the error.
func HttpGet(url string) (result string, err error) {
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return "", err
	}
	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36")

	// The shared default client has the same zero-value configuration as a
	// freshly allocated one, without building a new client per call.
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	// Accumulate bytes and convert once; repeated string += re-copies the
	// whole page for every 4 KiB chunk.
	var body []byte
	buf := make([]byte, 4096)
	for {
		n, readErr := resp.Body.Read(buf)
		// Data can arrive alongside an error, so consume it first.
		body = append(body, buf[:n]...)
		if readErr == io.EOF {
			break
		}
		if readErr != nil {
			// Report read failures even when they delivered zero bytes,
			// instead of returning a silently truncated page.
			return "", readErr
		}
	}
	return string(body), nil
}
// saveJoke2File writes the collected chapters to "D:/小说抓取/<title>.txt".
// Each chapter is written as its title, its content and a separator line, and
// a confirmation is printed per chapter.
//
// filetitle and filecontent are parallel slices; only as many chapters as the
// shorter one holds are written, so a length mismatch cannot cause an
// out-of-range panic. Create and write failures are reported on standard
// output.
func saveJoke2File(title string, filetitle, filecontent []string) {
	path := "D:/小说抓取/" + title + ".txt"
	f, err := os.Create(path)
	if err != nil {
		fmt.Println("os.Create err:", err)
		return
	}
	defer f.Close()

	n := len(filetitle)
	if len(filecontent) < n {
		n = len(filecontent)
	}
	for i := 0; i < n; i++ {
		chapter := filetitle[i] + "\n" + filecontent[i] + "\n" +
			"--------------------------------------\n"
		if _, err := f.WriteString(chapter); err != nil {
			fmt.Println("WriteString err:", err)
			return
		}
		fmt.Println(filetitle[i] + "写入完毕")
	}
}
// OnePage crawls ranking page idx. For every book linked from the page it
// loads the book's chapter list, fetches each chapter's title and content,
// and saves them with saveJoke2File under the book's title. It then sends idx
// on page.
//
// The send happens on every exit path, including a failed fetch of the
// ranking page: the goroutine that started the workers receives once per
// page, so a worker that returned without sending would leave it blocked
// forever.
func OnePage(idx int, page chan int) {
	defer func() { page <- idx }()

	url := "http://www.zongheng.com/rank/details.html?rt=6&d=1&p=" + strconv.Itoa(idx)
	result, err := HttpGet(url)
	if err != nil {
		fmt.Println("HttpGet err:", err)
		return
	}

	// Capture group 1 is the href of the first link after each book's cover
	// block.
	ret := regexp.MustCompile(`<div class="rank_d_book_img fl" title=".*">\r\n.*<a href="(?s:(.*?))"`)
	for _, match := range ret.FindAllStringSubmatch(result, -1) {
		// Replace the second "book" in the link with "showchapter" to get the
		// URL that is passed to SpiderJokerPage.
		catalogURL := replaceNth(match[1], "book", "showchapter", 2)
		Urls, title, err := SpiderJokerPage(catalogURL)
		if err != nil {
			fmt.Println("SpiderJokerPage err:", err)
			continue
		}

		// Fresh slices for every book: sharing them across books would put
		// the chapters of earlier books into every later book's file.
		var filetitle, filecontent []string
		for _, chapterURL := range strings.Split(Urls, "\n") {
			// Splitting a newline-terminated list leaves an empty last item.
			if chapterURL == "" {
				continue
			}
			title1, content1, err := readtitleconn(chapterURL)
			if err != nil {
				fmt.Println("readtitleconn err:", err)
				continue
			}
			filetitle = append(filetitle, title1)
			filecontent = append(filecontent, content1)
		}

		// Write the book once, after all of its chapters are collected,
		// instead of rewriting the whole file after every chapter.
		if len(filetitle) > 0 {
			saveJoke2File(title, filetitle, filecontent)
		}
	}
}
// readtitleconn fetches the chapter page at url and returns its title and
// content.
//
// The title is capture group 1 of the last title_txtbox div on the page. The
// content is capture group 1 of the last content div, with every "<p>",
// "</p>" and space removed. Either value is empty when its pattern does not
// match. A fetch failure is returned as err.
func readtitleconn(url string) (title1, content1 string, err error) {
	result, err := HttpGet(url)
	// The fetch error itself must be checked here; testing a different, still
	// nil error variable would let failed fetches pass as empty chapters.
	if err != nil {
		return "", "", err
	}

	ret1 := regexp.MustCompile(`<div class="title_txtbox">(?s:(.*?))</div>`)
	if alls1 := ret1.FindAllStringSubmatch(result, -1); len(alls1) > 0 {
		title1 = alls1[len(alls1)-1][1]
	}

	ret2 := regexp.MustCompile(`<div class="content" itemprop="acticleBody">(?s:(.*?))</div>`)
	if alls2 := ret2.FindAllStringSubmatch(result, -1); len(alls2) > 0 {
		content1 = alls2[len(alls2)-1][1]
		// Applied one after another, in this order, rather than in one pass.
		content1 = strings.Replace(content1, "<p>", "", -1)
		content1 = strings.Replace(content1, "</p>", "", -1)
		content1 = strings.Replace(content1, " ", "", -1)
	}
	return title1, content1, nil
}
// replaceNth returns s with its n-th non-overlapping occurrence of old
// (counting from 1, scanning left to right) replaced by new. If s has fewer
// than n occurrences, or n is less than 1, s is returned unchanged.
func replaceNth(s, old, new string, n int) string {
	pos := 0
	for seen := 0; seen < n; {
		off := strings.Index(s[pos:], old)
		if off < 0 {
			return s
		}
		pos += off
		seen++
		if seen == n {
			return s[:pos] + new + s[pos+len(old):]
		}
		// Continue the search just past the occurrence that was found.
		pos += len(old)
	}
	return s
}
// SpiderJokerPage fetches the page at url and returns the links and title
// found on it.
//
// Urls is the href of the first link after each ` col-4` list item, each
// followed by a newline (so a non-empty result ends with "\n"). title is the
// content of the last <h1> element, or empty when there is none. A fetch
// failure is returned as err with both strings empty.
func SpiderJokerPage(url string) (Urls, title string, err error) {
	page, fetchErr := HttpGet(url)
	if fetchErr != nil {
		return "", "", fetchErr
	}

	linkRe := regexp.MustCompile(`<li class=" col-4">\r\n.*<a href="(?s:(.*?))"`)
	var links []byte
	for _, m := range linkRe.FindAllStringSubmatch(page, -1) {
		links = append(links, m[1]...)
		links = append(links, '\n')
	}
	Urls = string(links)

	headings := regexp.MustCompile(`<h1>(?s:(.*?))</h1>`).FindAllStringSubmatch(page, -1)
	if last := len(headings) - 1; last >= 0 {
		title = headings[last][1]
	}
	return Urls, title, nil
}
// ToWorking crawls ranking pages start through end (inclusive) concurrently.
// It starts one OnePage goroutine per page and then waits for one completion
// value per started goroutine, printing each page number as it arrives.
func ToWorking(start, end int) {
	fmt.Printf("正在爬取第%d和%d页。。。\n", start, end)

	done := make(chan int)
	started := 0
	for p := start; p <= end; p++ {
		go OnePage(p, done)
		started++
	}

	// One receive per started worker keeps this function from returning
	// while pages are still being crawled.
	for ; started > 0; started-- {
		finished := <-done
		fmt.Printf("第 %d 页爬取完毕\n", finished)
	}
}
// main reads the first and last ranking page numbers from standard input and
// crawls that range with ToWorking. Unreadable input or a range that violates
// the prompted bounds (start >= 1, end >= start) is reported and nothing is
// crawled.
func main() {
	var start, end int
	fmt.Print("请输入爬取的起始页(>=1):")
	if _, err := fmt.Scan(&start); err != nil {
		fmt.Println("Scan err:", err)
		return
	}
	fmt.Print("请输入爬取的终止页(>=start)")
	if _, err := fmt.Scan(&end); err != nil {
		fmt.Println("Scan err:", err)
		return
	}
	if start < 1 || end < start {
		fmt.Println("invalid page range:", start, end)
		return
	}
	ToWorking(start, end)
}