编程语言
996
版权所有,转载请注明:http://www.lenggirl.com/language/go-picture.html
使用准备
1.安装Golang
2.下载爬虫包
go get -v github.com/hunterhug/marmot/expert
go get -v github.com/hunterhug/marmot/miner
go get -v github.com/hunterhug/parrot/util
程序
该程序只能抓取 HTML 中 src="http..." 形式的图片, 图片地址必须带有协议头 http(s); 其他如 data-src 属性中的图片, 以及混淆在 JS 中的图片, 均无法抓取。
See: https://github.com/hunterhug/marmot/blob/master/example/lesson/lesson6.go
package main

import (
	"errors"
	"fmt"
	"net/url"
	"strings"
	"sync"

	"github.com/hunterhug/marmot/expert"
	"github.com/hunterhug/marmot/miner"
	"github.com/hunterhug/parrot/util"
)

// MinerNum is the number of batches the found pictures are divided into;
// each batch is downloaded by its own goroutine.
var MinerNum = 5

// ProxyAddress is handed to miner.New for every worker this program creates.
// It stays nil unless it is set, e.g. to "socks5://127.0.0.1:1080".
var ProxyAddress interface{}

// main repeatedly asks for a page URL and a target directory and downloads
// the pictures found on that page. It loops until the process is killed.
func main() {
	// Uncomment to send requests through a proxy.
	// ProxyAddress = "socks5://127.0.0.1:1080"

	fmt.Println(`Welcome: Input "url" and picture keep "dir"`)
	for {
		fmt.Println("---------------------------------------------")
		// Named target (not url) so the net/url package is not shadowed.
		target := util.Input(`URL(Like: "http://publicdomainarchive.com")`, "http://publicdomainarchive.com")
		dir := util.Input(`DIR(Default: "./picture")`, "./picture")
		fmt.Printf("You will keep %s picture in dir %s\n", target, dir)
		fmt.Println("---------------------------------------------")

		if err := CatchPicture(target, dir); err != nil {
			fmt.Println("Error:" + err.Error())
		}
	}
}

// CatchPicture fetches the page at picture_url, extracts the picture
// addresses with expert.FindPicture, and saves each picture into dir using
// up to MinerNum concurrent workers.
//
// It returns an error when the URL does not parse, the directory cannot be
// created, a worker cannot be created, the page cannot be fetched, the
// picture list cannot be divided, or no picture is found ("empty").
// Failures on individual pictures are printed and do not abort the run.
// The function returns only after every started goroutine has finished.
func CatchPicture(picture_url string, dir string) error {
	if _, err := url.Parse(picture_url); err != nil {
		return err
	}
	if err := util.MakeDir(dir); err != nil {
		return err
	}

	// Fetch the page that lists the pictures.
	worker, err := miner.New(ProxyAddress)
	if err != nil {
		return err
	}
	result, err := worker.SetUrl(picture_url).SetUa(miner.RandomUa()).Get()
	if err != nil {
		return err
	}

	pictures := expert.FindPicture(string(result))
	if len(pictures) == 0 {
		return errors.New("empty")
	}

	// One batch of picture addresses per downloading goroutine.
	batches, err := util.DevideStringList(pictures, MinerNum)
	if err != nil {
		return err
	}

	// A WaitGroup counts goroutines rather than pictures, so a skipped or
	// failed picture can no longer leave this function waiting forever.
	var wg sync.WaitGroup
	defer wg.Wait()

	for num, imgs := range batches {
		w, err := poolWorker(num)
		if err != nil {
			// The deferred Wait lets already-started batches finish.
			return err
		}
		wg.Add(1)
		go func(imgs []string, w *miner.Worker, num int) {
			defer wg.Done()
			savePictures(w, imgs, dir, num)
		}(imgs, w, num)
	}
	return nil
}

// poolWorker returns the worker stored in miner.Pool under the batch number
// num, creating one with a random user agent and storing it when absent.
func poolWorker(num int) (*miner.Worker, error) {
	key := util.IS(num)
	if w, ok := miner.Pool.Get(key); ok {
		return w, nil
	}
	w, err := miner.New(ProxyAddress)
	if err != nil {
		return nil, err
	}
	w.SetUa(miner.RandomUa())
	miner.Pool.Set(key, w)
	return w, nil
}

// savePictures downloads every address in imgs with worker and writes it
// into dir. Pictures whose file already exists are skipped. A download or
// save failure is printed and the next picture is tried. num only labels
// the progress output.
func savePictures(worker *miner.Worker, imgs []string, dir string, num int) {
	for _, img := range imgs {
		if _, err := url.Parse(img); err != nil {
			continue
		}

		// Derive the file name from the address; '#' is replaced as well.
		filename := strings.Replace(util.ValidFileName(img), "#", "_", -1)
		path := dir + "/" + filename

		if util.FileExist(path) {
			fmt.Println("File Exist:" + path)
			continue
		}

		data, err := worker.SetUrl(img).Get()
		if err != nil {
			fmt.Println("Download " + img + " error:" + err.Error())
			continue
		}

		if err := util.SaveToFile(path, data); err != nil {
			fmt.Println("Save " + path + " error:" + err.Error())
			continue
		}
		fmt.Printf("SP%d: Keep in %s/%s\n", num, dir, filename)
	}
}
解释均已写在代码注释中, 运行后输出如下:
superpika@superpika-chen-110:~/code/src/github.com/hunterhug/marmot/example/lesson$ go run lesson6.go Welcome: Input "url" and picture keep "dir" --------------------------------------------- URL(Like: "http://publicdomainarchive.com") DIR(Default: "./picture") You will keep http://publicdomainarchive.com picture in dir ./picture --------------------------------------------- SP0: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_02_03_modern.jpg SP4: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_03_03_google_dark.png SP2: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_09_03_free-stock-photos-public-domain-images-003-667x1000-192684_667x675.jpg superpika@superpika-chen-110:~/code/src/github.com/hunterhug/marmot/example/lesson$ go run lesson6.go Welcome: Input "url" and picture keep "dir" --------------------------------------------- URL(Like: "http://publicdomainarchive.com") DIR(Default: "./picture") You will keep http://publicdomainarchive.com picture in dir ./picture --------------------------------------------- SP0: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_02_03_modern.jpg SP4: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_03_03_google_dark.png SP2: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_09_03_free-stock-photos-public-domain-images-003-667x1000-192684_667x675.jpg SP4: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_05_03_powered-by-wp-engine.png SP4: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_05_03_divi.png SP2: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_09_03_free-stock-photos-public-domain-images-002-1000x667.jpg SP4: Keep in 
./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_05_03_public-domain-mark.png SP3: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_01_03_public-domain-images-free-stock-photos008-1000x625.jpg SP0: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_09_03_Weekly.jpg SP1: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_11_03_free-stock-photos-public-domain-images-054-1000x667.jpg SP3: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_10_03_instagram_dark.png SP0: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_02_03_vintage.jpg SP2: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_09_03_free-stock-photos-public-domain-images-001-1000x667.jpg SP1: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_11_03_free-stock-photos-public-domain-images-070-1000x667.jpg SP3: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_03_03_twitter02_dark.png SP2: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_01_03_public-domain-images-free-stock-photos001-1000x750-167066_1000x675.jpg SP1: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_09_03_free-stock-photos-public-domain-images-035-1000x667.jpg SP3: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_03_03_facebook_dark.png SP0: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_11_03_free-stock-photos-public-domain-images-060-1000x667.jpg SP1: Keep in 
./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_09_03_free-stock-photos-public-domain-images-013-1000x667.jpg --------------------------------------------- URL(Like: "http://publicdomainarchive.com") SP4: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_05_03_powered-by-wp-engine.png SP4: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_05_03_divi.png SP2: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_09_03_free-stock-photos-public-domain-images-002-1000x667.jpg SP4: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_05_03_public-domain-mark.png SP3: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_01_03_public-domain-images-free-stock-photos008-1000x625.jpg SP0: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_09_03_Weekly.jpg SP1: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_11_03_free-stock-photos-public-domain-images-054-1000x667.jpg SP3: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_10_03_instagram_dark.png SP0: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_02_03_vintage.jpg SP2: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_09_03_free-stock-photos-public-domain-images-001-1000x667.jpg SP1: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_11_03_free-stock-photos-public-domain-images-070-1000x667.jpg SP3: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_03_03_twitter02_dark.png SP2: Keep in 
./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_01_03_public-domain-images-free-stock-photos001-1000x750-167066_1000x675.jpg SP1: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_09_03_free-stock-photos-public-domain-images-035-1000x667.jpg SP3: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_03_03_facebook_dark.png SP0: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_11_03_free-stock-photos-public-domain-images-060-1000x667.jpg SP1: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_09_03_free-stock-photos-public-domain-images-013-1000x667.jpg --------------------------------------------- URL(Like: "http://publicdomainarchive.com")
[图片上传失败...(image-685ed8-1557639583356)]