@@ -17,18 +17,18 @@ import (
// Fetcher holds the link state collected for one site.
type Fetcher struct {
	// Entrance is the site address the fetcher was created for; it is set
	// from the argument passed to FetcherFactory.
	Entrance string
	// Links is the list that SetLinks appends extracted links to.
	Links []string
	// LinksNew is set by crawl to gears.StrSliceDiff(Links, LinksOld).
	LinksNew []string
	// LinksOld is overwritten with Links by crawl once LinksNew is computed.
	LinksOld []string
}
2223
2324type ThePost struct {
24- Entrance string
25- Domain string
26- URL string
27- DOC * html.Node
28- Raw []byte
29- Title string
30- Body string
31- Date string
25+ Domain string
26+ URL string
27+ DOC * html.Node
28+ Raw []byte
29+ Title string
30+ Body string
31+ Date string
3232}
3333
3434type Paragraph struct {
@@ -111,6 +111,7 @@ func (f *Fetcher) SetLinks() error {
111111 }
112112 links , err := ExtractLinks (url .String ())
113113 if err != nil {
114+ log .Printf (`can't extract links from "%s": %s` , url , err )
114115 return err
115116 }
116117 links = gears .StrSliceDeDupl (links )
@@ -129,11 +130,10 @@ func (f *Fetcher) SetLinks() error {
129130 f .Links = append (f .Links , link )
130131 }
131132 }
132-
133- }
134- for i , l := range f .Links {
135- fmt .Printf ("%2d: %s\n " , i + 1 , l )
136133 }
134+ // for i, l := range f.Links {
135+ // fmt.Printf("%2d: %s\n", i+1, l)
136+ // }
137137 return nil
138138}
139139
@@ -145,48 +145,53 @@ func LinksFilter(links []string, regex string) []string {
145145 return flinks
146146}
147147
148- // WaitForServer attempts to contact the server of a URL.
149- // It tries for one minute using exponential back-off.
150- // It reports an error if all attemps fail.
151- func (post * ThePost ) WaitForServer () error {
152- const timeout = 1 * time .Minute
153- deadline := time .Now ().Add (timeout )
154- for tries := 0 ; time .Now ().Before (deadline ); tries ++ {
155- _ , err := http .Head (post .URL )
156- if err == nil {
157- return err // success
158- }
159- log .SetPrefix ("[wait]" )
160- log .SetFlags (0 )
161- log .Printf ("server not responding (%s); retrying..." , err )
162- time .Sleep (time .Second << uint (tries )) // exponential back-off
148+ func FetcherFactory (site string ) * Fetcher {
149+ return & Fetcher {
150+ Entrance : site ,
151+ Links : nil ,
152+ LinksNew : nil ,
153+ LinksOld : nil ,
154+ }
155+ }
156+
157+ func ThePostFactory (url string ) * ThePost {
158+ return & ThePost {
159+ URL : url ,
163160 }
164- return fmt .Errorf ("server %s failed to respond after %s" , post .URL , timeout )
165161}
166162
// breadthFirst calls f once for each distinct item in worklist, in the order
// the items first appear. Duplicate items are skipped.
//
// f reports only an error and returns no further items, so nothing is ever
// added to the worklist and a single pass is sufficient. (The earlier
// revision re-queued the processed batch, which only triggered a second pass
// over items that were already marked as seen.)
//
// An error returned by f is deliberately ignored so that one failing item
// does not prevent the remaining items from being processed; f is expected
// to report its own failures.
//
//	breadthFirst(crawl, os.Args[1:])
func breadthFirst(f func(item string) error, worklist []string) {
	seen := make(map[string]bool, len(worklist))
	for _, item := range worklist {
		if seen[item] {
			continue
		}
		seen[item] = true
		_ = f(item) // best effort: keep going with the next item
	}
}
184182
185- func crawl (url string ) []string {
186- fmt .Println (url )
187- list , err := ExtractLinks (url )
188- if err != nil {
189- log .Printf (`can't extract links from "%s": %s` , url , err )
183+ func crawl (url string ) error {
184+ f := FetcherFactory (url )
185+ log .Printf ("[*] Deal with: [%s]\n " , url )
186+ log .Println ("[*] Fetch links ..." )
187+ if err := f .SetLinks (); err != nil {
188+ log .Println (err )
189+ return err
190190 }
191- return list
191+ // Set LinksNew
192+ f .LinksNew = gears .StrSliceDiff (f .Links , f .LinksOld )
193+ // GetNews then compare via md5 and Save or Rewrite news exist
194+ // Set LinksOld
195+ f .LinksOld = f .Links
196+ return nil
192197}
0 commit comments