用了goquery 和regexp两个包
goquery 用法如下:
dom, err := goquery.NewDocumentFromReader(strings.NewReader(result))
if err != nil {
	fmt.Println("HttpGet err :", err)
}
dom.Find(".Programlist .Cont ul p").Each(func(i int, selection *goquery.Selection) {
	// if selection != nil {
	title += selection.Text() + "\r"
	titleS = append(titleS, selection.Text())
	// }
})
regexp用法:
// fmt.Println(video_cont)
rel2 := regexp.MustCompile(`"title":"流畅","url":"(.*?)"`)
if rel2 == nil {
	fmt.Println("准备好了12")
}
arr2 := rel2.FindAllStringSubmatch(video_cont, -1) // 获取的数据是二维的切片
package main
import (
	"database/sql"
	"fmt"
	"io"
	"net/http"
	"os"
	"regexp"
	"strconv"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
	_ "github.com/go-sql-driver/mysql"
)
// type collectionmwd struct {
// 	ID       int64  `db:"id"`
// 	title    string `db:"title"`
// 	cover    string `db:"cover"` // Note: in the MySQL users table the name column is not declared NOT NULL, so it may be NULL and a query can return nil; a plain string cannot receive nil, but sql.NullString can.
// 	videoUrl string `db:"videourl"`
// }
// MySQL connection settings. DB formats them into the data source name
// "USERNAME:PASSWORD@NETWORK(SERVER:PORT)/DATABASE".
//
// NOTE(review): the credentials are hard-coded in source; consider reading
// them from the environment or a config file before deploying anywhere real.
const (
	USERNAME = "root"          // database user
	PASSWORD = "root"          // database password
	NETWORK  = "tcp"           // protocol part of the DSN
	SERVER   = "localhost"     // database host
	PORT     = 3306            // database port
	DATABASE = "guanfu_school" // schema that holds the collectionmwd table
)
func main () {var start ,end int fmt.Printf("请输入起始页:(2013开始,2019结束)")fmt.Scan(&start)fmt.Printf("请输入结束页:(2013开始,2019结束)")fmt.Scan(&end)// image := []string{}// fmt.Println(len(image))Dowork(start,end)// title := []string{"12321321","dwefdsfsd","萨芬就开始放假都是放到数据库"}// path := []string{"12321321","dwefdsfsd","萨芬就开始放假都是放到数据库"}// fmt.Println(image[2])// fmt.Println(title[2])// fmt.Println(path[2])// insertData(image,title,path)
}func Dowork (start , end int) {// returnfmt.Println("正在爬取数据")// var title string//开始循环每个年份 //然后每个年份再循环找每一页的数据for i:=start;i<=end;i++ {for j:=1;j<=3;j++ {//写入文件var img stringvar title string var path string //插入数据库数据var imgS = make([]string,0)var titleS = make([]string,0)var pathS = make([]string,0)var url = fmt.Sprintf("/%s/%s.html",strconv.Itoa(i),strconv.Itoa(j)) //获取一年中每页的内容result,err := HttpGet(url)if err != nil {fmt.Println("HttpGet err :",err)break}// 过滤标题dom,err:=goquery.NewDocumentFromReader(strings.NewReader(result))if err!=nil{fmt.Println("HttpGet err :",err)}dom.Find(".Programlist .Cont ul p").Each(func(i int, selection *goquery.Selection) {// if selection != nil {title += selection.Text() + "r"titleS = append(titleS,selection.Text())// }})fmt.Println("title:",title)fmt.Println("titleS",titleS)// 过滤封面urlimgReg := regexp.MustCompile(`<img src="(.*?)"`)if imgReg == nil {fmt.Println("没有封面图")}imgMap := imgReg.FindAllStringSubmatch(result,-1)fmt.Println("imgMap:",imgMap)for k,data := range imgMap {if k > 1 {img += data[1] + "r"imgS = append(imgS,data[1])}}fmt.Println("img :",img)fmt.Println("imgS :",imgS)//找到详情页路径 再爬取代码rel := regexp.MustCompile(`<li><a href="(.*?)" `)if rel == nil {fmt.Println("准备好了吗")}arr := rel.FindAllStringSubmatch(result,-1)// fmt.Println("全部的路径:",arr)for _,data := range arr {fmt.Println("url=",data[1])video_url := ""+data[1]video_cont,err := HttpGet(video_url)if err != nil {fmt.Println("准备好了")}// fmt.Println(video_cont)rel2 := regexp.MustCompile(`"title":"流畅","url":"(.*?)"`)if rel2 == nil {fmt.Println("准备好了12")}arr2 := rel2.FindAllStringSubmatch(video_cont,-1) //获取的数据是二维的切片for _,d := range arr2 {fmt.Println("高清视频链接是",d)// if d[1] != "" {path += d[1]+"n"pathS = append(pathS,d[1])// } }}fmt.Println("path:",path)fmt.Println("pathS:",pathS)//把内容写入到文件fileName := strconv.Itoa(i)+"_"+strconv.Itoa(j)+".txt"// fileName := "path_"+strconv.Itoa(i)+"_"+strconv.Itoa(j)+".txt"if title != "" && img != "" && path != "" {f,err1 := os.Create(fileName)if 
err1 != nil {fmt.Println("os create error")continue}// for i:=0;i<=len(titleMap);i++ {f.WriteString(title)f.WriteString(img)f.WriteString(path)insertData(imgS,titleS,pathS,strconv.Itoa(i))// }f.Close()}}}
}func HttpGet(url string) (result string ,err error) {respon,err1 := http.Get(url)if err1 != nil {err = err1return }defer respon.Body.Close()//读取网页的内容 ?buf := make([]byte,1024*4)for {n,err2 := respon.Body.Read(buf)if n == 0{ //说明读取完毕fmt.Println("resp111 body.Read err",err2)break}result += string(buf[:n])}return
}func insertData(img []string, title []string, url []string, year string) {DB,err := DB()if err != nil {fmt.Println("数据库连接失败:",err)}fmt.Println("img的的长度是:",len(img))fmt.Println("title的的长度是:",len(title))fmt.Println("url",len(url))for i:=0;i<len(img);i++ {if len(img) == 0 || len(title) == 0 || len(url) == 0 {continue}result,err := DB.Exec("insert INTO collectionmwd(cover,title,videoUrl,years) values(?,?,?,?)",img[i],title[i],url[i],year);if err != nil {fmt.Printf("Insert into 数据 failed err %d:%v", i,err)continue}lastInsertId,err := result.LastInsertId()if err != nil {fmt.Printf("get lastInsertID failed :%v",lastInsertId)continue }// rowsaffected,err := result.RowsAffected()}// fmt.Println("RowsAffected:",rowsaffected)
}
func DB() (DB *sql.DB,err error) {dsn := fmt.Sprintf("%s:%s@%s(%s:%d)/%s",USERNAME,PASSWORD,NETWORK,SERVER,PORT,DATABASE)DB,err = sql.Open("mysql",dsn)if err != nil{fmt.Printf("Open mysql failed,err:%vn",err)return}fmt.Println("数据库运行到这里了")return
}
本文发布于:2024-02-01 08:26:28,感谢您对本站的认可!
本文链接:https://www.4u4v.net/it/170674718835230.html
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系,我们将在24小时内删除。
留言与评论(共有 0 条评论) |