1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
|
package main
import (
"archive/tar"
"compress/gzip"
"errors"
"flag"
"fmt"
"io"
"io/ioutil"
"log"
"net/http"
"net/url"
"os"
"path"
"path/filepath"
"strings"
"github.com/PuerkitoBio/goquery"
)
// Command-line flags.
var (
// label selects which HTML tag to scrape (must be a key of labelAttrMap).
label = flag.String("label", "img", "label to download")
)
// labelAttrMap maps each supported HTML tag name to the attribute
// that carries its resource URL.
var labelAttrMap = map[string]string{
"img": "src",
"script": "src",
"a": "href",
}
func CleanUrl(uri *url.URL, link string) string {
switch {
case strings.HasPrefix(link, "https") || strings.HasPrefix(link, "http"):
return link
case strings.HasPrefix(link, "//"):
return uri.Scheme + ":" + link
case strings.HasPrefix(link, "/"):
return fmt.Sprintf("%s://%s%s", uri.Scheme, uri.Host, link)
default:
p := strings.SplitAfter(uri.Path, "/")
path := strings.Join(p[:2], "") //一般情况是这样 ,/static/img/logo.png
return fmt.Sprintf("%s://%s%s%s", uri.Scheme, uri.Host, path, link)
}
}
// cleanUrls resolves every link in urls against the base URL u and returns
// the absolute forms. If u itself cannot be parsed the links are returned
// unchanged (the original silently ignored the parse error and would have
// passed a nil *url.URL into CleanUrl, panicking there).
func cleanUrls(u string, urls []string) []string {
	uri, err := url.Parse(u)
	if err != nil {
		log.Printf("cleanUrls: cannot parse base url %q: %v", u, err)
		return urls
	}
	ret := make([]string, 0, len(urls)) // pre-size: one output per input
	for _, link := range urls {
		ret = append(ret, CleanUrl(uri, link))
	}
	return ret
}
func fetch(url string) ([]string, error) {
var urls []string
resp, err := http.Get(url)
if err != nil {
return nil, err
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
return nil, errors.New(resp.Status)
}
doc, err := goquery.NewDocumentFromResponse(resp)
if err != nil {
return nil, err
}
doc.Find(*label).Each(func(i int, s *goquery.Selection) {
link, ok := s.Attr(labelAttrMap[*label])
if ok {
urls = append(urls, link)
}
})
return urls, nil
}
// downloadImgs fetches each URL in urls and saves it into dir, named after
// the final path component of the URL. Non-200 responses are skipped
// (best effort, matching the original's `continue`); transport or file
// errors abort the whole run.
func downloadImgs(urls []string, dir string) error {
	for _, u := range urls {
		if err := downloadOne(u, dir); err != nil {
			return err
		}
	}
	return nil
}

// downloadOne downloads a single URL into dir. Extracted into its own
// function so the deferred Close calls fire after every file instead of
// piling up until downloadImgs returns (the original deferred inside the
// loop, holding every response body and file handle open simultaneously).
func downloadOne(u, dir string) error {
	resp, err := http.Get(u)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		// Skip this resource; the original's `return errors.New(...)`
		// after `continue` was unreachable dead code.
		return nil
	}
	f, err := os.Create(filepath.Join(dir, path.Base(u)))
	if err != nil {
		return err
	}
	defer f.Close()
	// Propagate copy failures (the original discarded this error).
	_, err = io.Copy(f, resp.Body)
	return err
}
func maketar(dir string, w io.Writer) error {
basedir := filepath.Base(dir)
compress := gzip.NewWriter(w)
defer compress.Close()
tr := tar.NewWriter(compress)
defer tr.Close()
filepath.Walk(dir, func(name string, info os.FileInfo, err error) error {
// 写入tar的FileHeader
// 以读取的方式打开文件
// 判断目录和文件,如果是文件
// 把文件的内容写入到body
header, err := tar.FileInfoHeader(info, "")
if err != nil {
return err
}
p, _ := filepath.Rel(dir, name)
//fmt.Printf("dir:%s, name:%s, p:%s\n", dir, name, p)
header.Name = filepath.Join(basedir, p)
tr.WriteHeader(header)
if info.IsDir() {
return nil
}
f, err := os.Open(name)
if err != nil {
return err
}
defer f.Close()
io.Copy(tr, f)
return nil
})
return nil
}
// main scrapes the resource URLs (selected by -label) from the page given
// as the first positional argument, downloads them into a temp directory,
// and packs them into img.tar.gz in the current directory.
func main() {
	flag.Parse()
	// Use flag.Arg(0), not os.Args[1]: after flag.Parse(), os.Args[1]
	// would be "-label" whenever the flag is supplied before the URL.
	if flag.NArg() < 1 {
		log.Fatalf("usage: %s [-label img] <url>", os.Args[0])
	}
	target := flag.Arg(0)
	urls, err := fetch(target)
	if err != nil {
		log.Fatal(err)
	}
	urls = cleanUrls(target, urls)
	tmpdir, err := ioutil.TempDir("", "spider")
	if err != nil {
		log.Fatal(err)
	}
	// Remove the scratch directory on normal exit (won't run on log.Fatal,
	// matching the original's leak-on-error behavior).
	defer os.RemoveAll(tmpdir)
	if err := downloadImgs(urls, tmpdir); err != nil {
		log.Fatal(err) // was log.Panic — Fatal everywhere for consistency
	}
	f, err := os.Create("img.tar.gz")
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()
	// The original discarded maketar's error, silently producing a
	// possibly-truncated archive.
	if err := maketar(tmpdir, f); err != nil {
		log.Fatal(err)
	}
}
|