Skip to content

Commit 6ee7003

Browse files
committed
Fetcher factory and actions pass the tests, but the post factory still needs code.
1 parent bfaa7c3 commit 6ee7003

2 files changed

Lines changed: 58 additions & 48 deletions

File tree

internal/fetcher/fetcher.go

Lines changed: 42 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -17,18 +17,18 @@ import (
1717
type Fetcher struct {
1818
Entrance string
1919
Links []string
20-
Posts []ThePost
20+
LinksNew []string
21+
LinksOld []string
2122
}
2223

2324
type ThePost struct {
24-
Entrance string
25-
Domain string
26-
URL string
27-
DOC *html.Node
28-
Raw []byte
29-
Title string
30-
Body string
31-
Date string
25+
Domain string
26+
URL string
27+
DOC *html.Node
28+
Raw []byte
29+
Title string
30+
Body string
31+
Date string
3232
}
3333

3434
type Paragraph struct {
@@ -111,6 +111,7 @@ func (f *Fetcher) SetLinks() error {
111111
}
112112
links, err := ExtractLinks(url.String())
113113
if err != nil {
114+
log.Printf(`can't extract links from "%s": %s`, url, err)
114115
return err
115116
}
116117
links = gears.StrSliceDeDupl(links)
@@ -129,11 +130,10 @@ func (f *Fetcher) SetLinks() error {
129130
f.Links = append(f.Links, link)
130131
}
131132
}
132-
133-
}
134-
for i, l := range f.Links {
135-
fmt.Printf("%2d: %s\n", i+1, l)
136133
}
134+
// for i, l := range f.Links {
135+
// fmt.Printf("%2d: %s\n", i+1, l)
136+
// }
137137
return nil
138138
}
139139

@@ -145,48 +145,53 @@ func LinksFilter(links []string, regex string) []string {
145145
return flinks
146146
}
147147

148-
// WaitForServer attempts to contact the server of a URL.
149-
// It tries for one minute using exponential back-off.
150-
// It reports an error if all attempts fail.
151-
func (post *ThePost) WaitForServer() error {
152-
const timeout = 1 * time.Minute
153-
deadline := time.Now().Add(timeout)
154-
for tries := 0; time.Now().Before(deadline); tries++ {
155-
_, err := http.Head(post.URL)
156-
if err == nil {
157-
return err // success
158-
}
159-
log.SetPrefix("[wait]")
160-
log.SetFlags(0)
161-
log.Printf("server not responding (%s); retrying...", err)
162-
time.Sleep(time.Second << uint(tries)) // exponential back-off
148+
func FetcherFactory(site string) *Fetcher {
149+
return &Fetcher{
150+
Entrance: site,
151+
Links: nil,
152+
LinksNew: nil,
153+
LinksOld: nil,
154+
}
155+
}
156+
157+
func ThePostFactory(url string) *ThePost {
158+
return &ThePost{
159+
URL: url,
163160
}
164-
return fmt.Errorf("server %s failed to respond after %s", post.URL, timeout)
165161
}
166162

167163
// breadthFirst calls f for each item in the worklist.
168164
// Any items returned by f are added to the worklist.
169165
// f is called at most once for each item.
170166
// breadthFirst(crawl, os.Args[1:])
171-
func breadthFirst(f func(item string) []string, worklist []string) {
167+
func breadthFirst(f func(item string) error, worklist []string) {
172168
seen := make(map[string]bool)
173169
for len(worklist) > 0 {
174170
items := worklist
175171
worklist = nil
176172
for _, item := range items {
177173
if !seen[item] {
178174
seen[item] = true
179-
worklist = append(worklist, f(item)...)
175+
f(item)
176+
worklist = items
177+
// worklist = append(worklist, f(item)...)
180178
}
181179
}
182180
}
183181
}
184182

185-
func crawl(url string) []string {
186-
fmt.Println(url)
187-
list, err := ExtractLinks(url)
188-
if err != nil {
189-
log.Printf(`can't extract links from "%s": %s`, url, err)
183+
func crawl(url string) error {
184+
f := FetcherFactory(url)
185+
log.Printf("[*] Deal with: [%s]\n", url)
186+
log.Println("[*] Fetch links ...")
187+
if err := f.SetLinks(); err != nil {
188+
log.Println(err)
189+
return err
190190
}
191-
return list
191+
// Set LinksNew
192+
f.LinksNew = gears.StrSliceDiff(f.Links, f.LinksOld)
193+
// GetNews then compare via md5 and Save or Rewrite news exist
194+
// Set LinksOld
195+
f.LinksOld = f.Links
196+
return nil
192197
}

internal/fetcher/fetcher_test.go

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,20 @@ import (
44
"testing"
55
)
66

7-
func TestSetLinks(t *testing.T) {
8-
var f = &Fetcher{
9-
Entrance: "https://www.rfa.org/mandarin/",
10-
// Entrance: "https://www.voachinese.com",
11-
Links: nil,
12-
Posts: nil,
13-
}
14-
err := f.SetLinks()
15-
if err != nil {
16-
t.Errorf("SetLinks fail!\n%s", err)
17-
}
7+
// func TestSetLinks(t *testing.T) {
8+
// var f = &Fetcher{
9+
// Entrance: "https://www.rfa.org/mandarin/",
10+
// // Entrance: "https://www.voachinese.com",
11+
// Links: nil,
12+
// LinksNew: nil,
13+
// LinksOld: nil,
14+
// }
15+
// err := f.SetLinks()
16+
// if err != nil {
17+
// t.Errorf("SetLinks fail!\n%s", err)
18+
// }
19+
// }
20+
21+
func TestCrawl(t *testing.T) {
22+
breadthFirst(crawl, []string{"https://www.rfa.org/mandarin/"})
1823
}

0 commit comments

Comments
 (0)