-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathoptions.go
99 lines (88 loc) · 2.31 KB
/
options.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
package crawler
import (
"net/http"
"strings"
"sync"
"github.com/pkg/errors"
)
// Option is used to provide optional configuration to a crawler.
//
// An Option is a function that receives a pointer to the crawler's options
// value and may modify it. It returns a non-nil error when the requested
// setting is invalid (see WithConcurrentRequests for an example).
type Option func(*options) error
// options collects the settings that Option functions write to.
type options struct {
// maxDepth is not assigned by any Option visible here; WithMaxDepth
// registers a checkFetch entry instead.
// NOTE(review): confirm whether this field is still read anywhere.
maxDepth int
// transport is the http.RoundTripper stored by WithHTTPTransport.
transport http.RoundTripper
// checkFetch holds the checks appended by WithCheckFetch, in the order
// the options were applied.
checkFetch []CheckFetchFunc
// goroutines is the value stored by WithConcurrentRequests; that option
// only accepts positive values.
goroutines int
}
// WithConcurrentRequests sets how many concurrent requests the crawler is
// allowed to make.
//
// The value is validated when the returned Option is applied, not when
// WithConcurrentRequests is called: applying it returns an error if n is zero
// or negative, and leaves the options untouched in that case.
func WithConcurrentRequests(n int) Option {
	return func(cfg *options) error {
		if n > 0 {
			cfg.goroutines = n
			return nil
		}
		return errors.Errorf("number of workers must be a positive integer. was: %d", n)
	}
}
// WithHTTPTransport stores rt as the http.RoundTripper in the crawler's
// options. Applying the returned Option never fails; rt is stored as given,
// without a nil check.
func WithHTTPTransport(rt http.RoundTripper) Option {
	apply := func(cfg *options) error {
		cfg.transport = rt
		return nil
	}
	return apply
}
// WithMaxDepth sets the max depth of the crawl. It registers a check through
// WithCheckFetch that accepts a request only when its depth is less than or
// equal to depth.
//
// depth must be greater than zero. Otherwise WithMaxDepth panics immediately,
// at the point the option is constructed rather than when it is applied.
func WithMaxDepth(depth int) Option {
	if depth <= 0 {
		panic("depth should always be greater than zero")
	}
	return WithCheckFetch(func(req *Request) bool {
		return req.depth <= depth
	})
}
// WithCheckFetch registers fn as a check that will be run before fetching
// each page to decide whether the page should be fetched.
//
// Applying the returned Option appends fn to the checks already present in
// the options; it does not replace them, and it never returns an error.
func WithCheckFetch(fn CheckFetchFunc) Option {
	register := func(cfg *options) error {
		cfg.checkFetch = append(cfg.checkFetch, fn)
		return nil
	}
	return register
}
// WithOneRequestPerURL adds a check that lets each URL through only once.
//
// URLs are identified by the exact string returned by req.URL.String(). The
// first request for a given string is accepted and recorded; every later
// request with the same string is rejected. The set of seen URLs is created
// per call to WithOneRequestPerURL, is guarded by a mutex so the check can be
// invoked from multiple goroutines, and is never pruned.
func WithOneRequestPerURL() Option {
	var (
		mu   sync.Mutex
		seen = make(map[string]struct{})
	)
	return WithCheckFetch(func(req *Request) bool {
		// Serialize the URL once, and do it before taking the lock so the
		// critical section covers only the map lookup and insert.
		key := req.URL.String()

		mu.Lock()
		defer mu.Unlock()
		if _, dup := seen[key]; dup {
			return false
		}
		seen[key] = struct{}{}
		return true
	})
}
// WithAllowedHosts adds a check to only allow URLs with the given hosts.
//
// Each entry in hosts is trimmed of surrounding whitespace. Matching is
// case-insensitive, because host names are case-insensitive: both the
// configured entries and the request's host are lowercased before comparison.
// The comparison uses req.URL.Host as a whole, so an entry must match that
// field exactly apart from case.
// NOTE(review): if Request.URL is a *url.URL, Host includes the port when one
// is present — confirm against the Request definition.
//
// When no hosts are given, the check rejects every request.
func WithAllowedHosts(hosts ...string) Option {
	allowed := make(map[string]struct{}, len(hosts))
	for _, h := range hosts {
		allowed[strings.ToLower(strings.TrimSpace(h))] = struct{}{}
	}
	return WithCheckFetch(func(req *Request) bool {
		_, ok := allowed[strings.ToLower(req.URL.Host)]
		return ok
	})
}
// WithExcludedHosts adds a check to only allow URLs with hosts other than the
// given ones.
//
// Each entry in hosts is trimmed of surrounding whitespace. Matching is
// case-insensitive, because host names are case-insensitive: both the
// configured entries and the request's host are lowercased before comparison,
// so an excluded host cannot slip through by differing only in letter case.
// The comparison uses req.URL.Host as a whole.
// NOTE(review): if Request.URL is a *url.URL, Host includes the port when one
// is present — confirm against the Request definition.
//
// When no hosts are given, the check accepts every request.
func WithExcludedHosts(hosts ...string) Option {
	excluded := make(map[string]struct{}, len(hosts))
	for _, h := range hosts {
		excluded[strings.ToLower(strings.TrimSpace(h))] = struct{}{}
	}
	return WithCheckFetch(func(req *Request) bool {
		_, found := excluded[strings.ToLower(req.URL.Host)]
		return !found
	})
}