Skip to content

Commit b0555b3

Browse files
committed
rfa body fetching test passed.
1 parent df914ab commit b0555b3

4 files changed

Lines changed: 65 additions & 2 deletions

File tree

internal/fetcher/body.go

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,10 @@ func (p *Post) SetBody() error {
2323
if err := p.FmtBody(Voa); err != nil {
2424
return err
2525
}
26+
case "www.rfa.org":
27+
if err := p.FmtBody(Voa); err != nil {
28+
return err
29+
}
2630
}
2731
return nil
2832
}
@@ -128,3 +132,22 @@ func Voa(p *Post) (string, error) {
128132
body = strings.ReplaceAll(body, "br \n", "")
129133
return body, nil
130134
}
135+
136+
func Rfa(p *Post) (string, error) {
137+
doc := p.DOC
138+
body := ""
139+
// Fetch content nodes
140+
articleDoc := ElementsByTagAndClass(doc, "div", "wsw")
141+
if len(articleDoc) == 0 {
142+
return "", errors.New(`[-] There is no element match '<div class="wsw">'`)
143+
}
144+
plist := ElementsByTagName(articleDoc[0], "p")
145+
for _, v := range plist {
146+
body += v.FirstChild.Data + " \n"
147+
}
148+
body = strings.ReplaceAll(body, "strong \n", "")
149+
body = strings.ReplaceAll(body, "span \n", "")
150+
body = strings.ReplaceAll(body, "br \n", "")
151+
return body, nil
152+
153+
}

internal/fetcher/node.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,26 @@ func ElementsByTagAndClass(doc *html.Node, tag, class string) []*html.Node {
4040
return nodes
4141
}
4242

43+
func ElementsByTagAndId(doc *html.Node, tag, id string) []*html.Node {
44+
var nodes []*html.Node
45+
if tag == "" || id == "" {
46+
return nil
47+
}
48+
if doc.Type == html.ElementNode {
49+
if tag == doc.Data {
50+
for _, a := range doc.Attr {
51+
if a.Key == "id" && a.Val == id {
52+
nodes = append(nodes, doc)
53+
}
54+
}
55+
}
56+
}
57+
for c := doc.FirstChild; c != nil; c = c.NextSibling {
58+
nodes = append(nodes, ElementsByTagAndId(c, tag, id)...)
59+
}
60+
return nodes
61+
}
62+
4363
func ForEachNode(n *html.Node, pre, post func(n *html.Node)) {
4464
if pre != nil {
4565
pre(n)

internal/fetcher/node_test.go

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,24 @@ func TestElementsByTagAndClass(t *testing.T) {
2020
fmt.Println(v.FirstChild.Data)
2121
}
2222
}
23+
24+
func TestElementsByTagAndId(t *testing.T) {
25+
p := PostFactory("https://www.rfa.org/mandarin/yataibaodao/junshiwaijiao/jt-07022020105416.html")
26+
raw, doc, err := GetRawAndDoc(p.URL, 1*time.Minute)
27+
if err != nil {
28+
t.Errorf("GetDOC error: %v", err)
29+
}
30+
p.DOC = doc
31+
p.Raw = raw
32+
tc := ElementsByTagAndId(doc, "div", "storytext")
33+
plist := ElementsByTagName(tc[0], "p")
34+
for _, v := range plist {
35+
if v.FirstChild != nil {
36+
if v.FirstChild.Data == "b" {
37+
fmt.Println("**" + v.FirstChild.FirstChild.Data + "**")
38+
} else {
39+
fmt.Println(v.FirstChild.Data)
40+
}
41+
}
42+
}
43+
}

internal/fetcher/post.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,7 @@ func (p *Post) SetPost() error {
4545
if err != nil {
4646
return err
4747
}
48-
p.Raw = raw
49-
p.DOC = doc
48+
p.Raw, p.DOC = raw, doc
5049
// set Date
5150
if err := p.SetDate(); err != nil {
5251
return err

0 commit comments

Comments
 (0)