-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathGetChinaNews.py
More file actions
345 lines (318 loc) · 12.9 KB
/
Copy pathGetChinaNews.py
File metadata and controls
345 lines (318 loc) · 12.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
#coding=UTF-8
import re,urllib,socket,os,datetime,sys,time
from sgmllib import SGMLParser
#20140102增加内容:添加文件列表配置项,统一输出文件的编码(IFENG的默认输出为UTF8,调整为gb2312),得到获取的新闻列表并按照发表时间排序输出到newslist.txt
reload(sys)
sys.setdefaultencoding('utf8')
"""
默认站点列表,各站点的标签及其说明如下:
中国新闻网(ZXW)
网易新闻(163)
人民网(RMW)
新浪(SINA)
凤凰资讯(IFENG)
"""
# ---- Download configuration ----
# News source sites that this script knows how to scrape.
defaultSiteList = ["ZXW","163","RMW","SINA","IFENG"]
# Directory (with trailing separator) where newslist.txt is written.
newsListFilePath = os.getcwd()+os.path.sep
# Default root directory for downloaded articles.
argD = newsListFilePath+'dataNews'
# Default start / end dates (YYYY-MM-DD).
defaultStartTime = "2013-11-02"
defaultEndTime = "2013-11-03"
# Default socket timeout, in seconds, for URL connections.
defaultSockTimeLimit = 20
# Per-site [attribute name, attribute value] identifying the <div> that holds
# the article body; it differs from site to site.
dirForDiv = {
    'ZXW':['class','left_zw'],
    '163':['id','endText'],
    'RMW':['id','p_content'],
    'SINA':['id','artibody'],
    'IFENG':['id','main_content'],
}
# Per-site regular expression that extracts one news entry from the site's
# roll-news page.
dirRegex = {
    'ZXW':r'<div class=\"dd_bt\"><a href=[^<>]*>[^<>]*</a></div><div class=\"dd_time\">[\d]{1,2}-[\d]{1,2} [\d]{2}:[\d]{2}</div>',
    'RMW':r'<a href[^<>]*>[^<>]*</a>\[[\d]{2}[^<>]*[\d]{2}:[\d]{2}\]<br>',
    '163':r't\"[^}]*',
    'SINA':r',title[^}]*',
    'IFENG':r'<h4>[\d]{2}/[\d]{2} [\d]{2}:[\d]{2}</h4><a href=[^<>]*>[^<]*',
}
"""
Roll-news page URL of each site:
ZXW:"http://www.chinanews.com/scroll-news/" + Year + "/" + Month + Day + "/news.shtml"
163:"http://snapshot.news.163.com/wgethtml/http+!!news.163.com!special!0001220O!news_json.js/"+Year+"-"+Month+"/"+Day+"/0.js"
RMW:"http://people.com.cn/GB/24hour/index" + Year + "_" + Month + "_" + Day +".html"
SINA:"http://roll.news.sina.com.cn/interface/rollnews_ch_out_interface.php?col=89&spec=&type=&date="+Year+"-"+Month+"-"+Day+"&k=&offset_page=0&offset_num=0&num=30000&asc=&page=1"
IFENG:"http://news.ifeng.com/rt-channel/rtlist_"+Year+Month+Day+"/"+str(pageNum)+".shtml"
"""
class GetNewsParser(SGMLParser):
    """
    SGMLParser subclass that extracts the body text of a news article.

    Text is collected from <p> elements that appear inside the <div> whose
    attribute matches dirForDiv[site]; the pieces accumulate in newsText.
    """

    def __init__(self, site="163"):
        # SGMLParser.__init__ runs before self.site is assigned, as in the
        # original; reset() below does not read self.site.
        SGMLParser.__init__(self)
        self.site = site

    def reset(self):
        """Clear the collected text and all tracking state."""
        self.newsText = []      # text fragments collected so far
        self.flag = False       # True while inside the article <div>
        self.getdata = False    # True while inside a <p> of the article <div>
        self.verbatim = 0       # depth of <div>s nested inside the article <div>
        SGMLParser.reset(self)

    def start_div(self, attrs):
        """Enter the article <div>, or count a <div> nested inside it."""
        if self.flag:
            self.verbatim += 1
            return
        for attr_name, attr_value in attrs:
            target = dirForDiv[self.site]
            if attr_name == target[0] and attr_value == target[1]:
                self.flag = True
                break

    def end_div(self):
        """Leave a nested <div>, or the article <div> itself at depth 0."""
        if self.verbatim == 0:
            self.flag = False
        elif self.flag:
            self.verbatim -= 1

    def start_p(self, attrs):
        """Start collecting text when a <p> opens inside the article <div>."""
        if self.flag:
            self.getdata = True

    def end_p(self):
        """Stop collecting text when the paragraph closes."""
        if self.getdata:
            self.getdata = False

    def start_script(self, attrs):
        """For ZXW, stop collecting when a <script> opens inside a paragraph."""
        if self.site == "ZXW" and self.getdata:
            self.getdata = False

    def handle_data(self, text):
        """Store character data while collection is switched on."""
        if self.getdata:
            self.newsText.append(text)
class GetChinaNews():
    """
    Download the roll-news articles of several Chinese news sites.

    For every date in the configured range and every configured site, the
    site's roll-news page is fetched, each listed article is downloaded, its
    body text is extracted with GetNewsParser and stored as
    <root>/<YYYYMMDD>/<site>-<n>.txt.  Finally a time-sorted index of all
    stored articles is written to newslist.txt.
    """

    def __init__(self,str_start_time=defaultStartTime,str_end_time=defaultEndTime,
                 dirName=os.getcwd(),siteList=defaultSiteList,
                 timeLimit=defaultSockTimeLimit):
        """
        str_start_time / str_end_time: 'YYYY-MM-DD'; the end date is exclusive.
        dirName: root directory that receives one sub-directory per date.
        siteList: site tags to download (tags not in defaultSiteList are skipped).
        timeLimit: default socket timeout in seconds.
        """
        self.date_range = self.dateRange(str_start_time,str_end_time)
        self.dir_name = dirName
        self.root_dir_name = dirName
        self.siteList = siteList
        # Remembered so the timeout can be restored after the RMW override.
        self.timeLimit = timeLimit
        socket.setdefaulttimeout(timeLimit)
        # Date currently being processed; overwritten by getChinaNews().
        self.strYear = "2013"
        self.strMonth = "11"
        self.strDay = "02"
        self.Url = "This is the roll news page Url."
        self.tag = 0            # number of articles stored for the current site
        self.newsList = []      # (title, file path, epoch seconds, site) tuples

    def dateRange(self,str_start_time,str_end_time):
        """
        Yield one datetime per day from the start date up to, but not
        including, the end date (both given as 'YYYY-MM-DD').
        """
        startParts = [int(part) for part in str_start_time.split('-')[:3]]
        endParts = [int(part) for part in str_end_time.split('-')[:3]]
        start_time = datetime.datetime(*startParts)
        end_time = datetime.datetime(*endParts)
        for n in range((end_time-start_time).days):
            yield start_time + datetime.timedelta(n)

    def getNewsProperties(self,site,str):
        """
        Split one regex match from a roll-news page into
        [newsTitle, newsUrl, newsTime].

        The parameter name `str` shadows the builtin inside this method; it is
        kept so existing keyword callers continue to work.
        Returns None for a site tag that is not handled here.
        """
        if site in ("ZXW","RMW","IFENG"):
            delimiter = '>'
        else:
            delimiter = '"'
        iList = str.split(delimiter)
        currentDate = self.strYear+'-'+self.strMonth+'-'+self.strDay
        if site == "ZXW":
            return [iList[2][:-3],iList[1][9:-1],
                    currentDate+' '+iList[5].split(' ')[1][:-5]+":00"]
        elif site == "RMW":
            # Only the date is known here; the time of day is added later
            # from the article page.
            return [iList[1][:-3],iList[0].split(' ')[1][6:-1],currentDate]
        elif site == "163":
            return [iList[2],iList[6],iList[10]]
        elif site == "SINA":
            # The last 10 characters of this field are an epoch timestamp.
            return [iList[1],iList[3],
                    time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(int(iList[4][-10:])))]
        elif site == "IFENG":
            return [iList[3],iList[2].split('"')[1],currentDate+' '+iList[1][6:11]]

    def _fetchUrl(self,url):
        """Return the body of `url`; the connection is closed even on error."""
        sock = urllib.urlopen(url)
        try:
            return sock.read()
        finally:
            sock.close()

    def getNewsFromRollPage(self,site):
        """
        Download every article listed on the roll-news page self.Url.

        Returns False when the roll page could not be fetched or, for IFENG,
        when the page carries the marker of a non-existent page; True
        otherwise.  The IFENG pagination loop stops on False.
        """
        try:
            htmlSource = self._fetchUrl(self.Url)
        except Exception:
            print("Something wrong when openning rollNewsPage:"+self.Url)
            # Nothing to parse; returning False also keeps the IFENG
            # pagination loop from retrying hundreds of dead pages.
            return False
        #solve the 404 problem
        if site == "IFENG" and re.search(r"div class=\"mat\"[^>]*",htmlSource):
            return False
        currentDate = self.strYear + '-' + self.strMonth + '-' + self.strDay
        for rawItem in re.findall(dirRegex[site],htmlSource,re.M):
            # Reset per item so the error message never shows a stale URL.
            newsUrl = "NONE"
            try:
                # Read the article properties from the roll page entry.
                newsTitle,newsUrl,newsTime = self.getNewsProperties(site,rawItem)
                #special for ZXW: relative links need the site prefix
                if site == "ZXW" and newsUrl[:4] != "http":
                    newsUrl = "http://www.chinanews.com" + newsUrl
                #special for 163 because there are many other days' news on the rollNewsUrl
                if site == "163" and newsTime.split(' ')[0] != currentDate:
                    continue
                #do not consider the photo news
                if site == "IFENG" and newsUrl[22:27] == "photo":
                    continue
                htmlSource = self._fetchUrl(newsUrl)
                #special for 163: long news is paginated; fetch the all-in-one page
                if site == "163":
                    allUrl = re.search(r"_all.html#p1",htmlSource)
                    if allUrl:
                        newsUrl = newsUrl[:-5]+allUrl.group()
                        htmlSource = self._fetchUrl(newsUrl)
                #RMW: the time of day is only available on the article page
                if site == "RMW":
                    RmwTime = re.search(r'[\d]{2}:[\d]{2}<',htmlSource).group()
                    newsTime = newsTime + " " + RmwTime[:-1]+":00"
                #IFENG times carry no seconds
                if site == "IFENG":
                    newsTime += ":00"
                # Validate the timestamp before anything is written to disk.
                stime = time.mktime(time.strptime(newsTime, '%Y-%m-%d %H:%M:%S'))
                #use Parser to get text content of news,stored in strText
                getcontent = GetNewsParser(site)
                getcontent.feed(htmlSource)
                # Joined before close(), matching the original ordering.
                strText = "".join(getcontent.newsText)
                getcontent.close()
                filePath = self.dir_name+site+"-"+str(self.tag)+'.txt'
                txtSource = newsTitle+"\n"+newsUrl+"\n"+newsTime+"\n"+strText
                # IFENG pages are re-encoded to gb2312, in line with the
                # other sites' output.
                if site == "IFENG":
                    newsTitle = newsTitle.encode('gb2312','ignore')
                    txtSource = txtSource.encode('gb2312','ignore')
                with open(filePath,'w') as f:
                    f.write(txtSource)
                # Keep the article for the sorted index: title, file path,
                # publication time, source site.
                self.newsList.append((newsTitle,filePath,stime,site))
                # Counted only once the article is really stored, so the
                # per-site success count is accurate.
                self.tag = self.tag + 1
            except KeyboardInterrupt:
                print("Stopped By User.")
                sys.exit(0)
            except Exception:
                print("SomethinWrong when getting "+site+" news:"+newsUrl)
        return True

    def getNewsFromSite(self,site):
        """Build the roll-news URL(s) of `site` for the current date and download them."""
        self.tag = 0
        year,month,day = self.strYear,self.strMonth,self.strDay
        if site == "IFENG":
            # IFENG is paginated; 1000 is the upper bound for the page number.
            for pageNum in range(1,1000):
                self.Url = "http://news.ifeng.com/rt-channel/rtlist_"+year+month+day+"/"+str(pageNum)+".shtml"
                if not self.getNewsFromRollPage(site):
                    break
        else:
            #the rollNewsUrl of every single-page site
            rollUrls = {
                "ZXW": "http://www.chinanews.com/scroll-news/" + year + "/" + month + day + "/news.shtml",
                "RMW": "http://people.com.cn/GB/24hour/index" + year + "_" + month + "_" + day +".html",
                "163": "http://snapshot.news.163.com/wgethtml/http+!!news.163.com!special!0001220O!news_json.js/"+year+"-"+month+"/"+day+"/0.js",
                "SINA": "http://roll.news.sina.com.cn/interface/rollnews_ch_out_interface.php?col=89&spec=&type=&date="+year+"-"+month+"-"+day+"&k=&offset_page=0&offset_num=0&num=30000&asc=&page=1",
            }
            if site in rollUrls:
                self.Url = rollUrls[site]
                self.getNewsFromRollPage(site)
        print("Get "+ str(self.tag) + " " + site + " news successfully.")

    def getChinaNews(self):
        """
        Download all configured sites for every date in the range, then write
        the time-sorted index to newslist.txt.
        """
        try:
            for tt in self.date_range:
                print("Get News From Date:"+str(tt))
                self.strYear = str(tt.year)
                self.strMonth = "%02d" % tt.month
                self.strDay = "%02d" % tt.day
                #create folder for the date tt
                self.dir_name = self.root_dir_name + "/"+self.strYear+self.strMonth+self.strDay+"/"
                if not os.path.exists(self.dir_name):
                    os.makedirs(self.dir_name)
                for site in self.siteList:
                    if site not in defaultSiteList:
                        continue
                    # RMW gets a longer timeout; every other site uses the
                    # configured one again instead of inheriting RMW's.
                    if site == "RMW":
                        socket.setdefaulttimeout(50)
                    else:
                        socket.setdefaulttimeout(self.timeLimit)
                    self.getNewsFromSite(site)
            # Sort the collected news by publication time and write the index.
            self.newsList.sort(key=lambda d:d[2])
            # The original mode string 'wa' is not a valid mode; 'w' is what
            # C libraries that ignore the stray 'a' effectively used.
            with open(newsListFilePath+'newslist.txt','w') as newslistFile:
                newslistFile.write('===========NEWS_LIST'+time.strftime('%Y-%m-%d %H:%M:%S')+'============\n')
                for title,path,stime,site in self.newsList:
                    newslistFile.write(site+"###"+title+"###"+path+"###"+str(stime)+"\n")
        except KeyboardInterrupt:
            print("Stopped By User.")
            sys.exit(0)
        except Exception:
            # `except Exception` lets SystemExit (raised by the Ctrl-C
            # handlers) propagate instead of being swallowed here.
            print("Somethin wrong when getting chinanews.")
#"2013-11-01","2013-12-01",dirName='D:/testnews',timeLimit=20
#s:str_start_time,e:str_end_time,d:dirName,t:timeLimit
# Command-line handling.  Supported invocations:
#   (no argument)               news of today
#   -a                          news of the most recent 30 days
#   -d 2013-11-01               news of the given day
#   -d 2013-11-01 2013-11-02    news from the first day to the last day, inclusive
#   -dx 2013-11-01 t            t days of news starting at the given day
#   -help                       show the help text
# GetChinaNews treats the end date as exclusive, hence the "+ one day" below.
def _commandError():
    """Report an unknown command line and exit."""
    print("commond error,please see -help")
    sys.exit(0)

def _parseDateArg(text):
    """Return the datetime.date for a YYYY-MM-DD argument, or report it and exit."""
    if re.match(r"[\d]{4}-[\d]{2}-[\d]{2}$",text):
        try:
            year,month,day = [int(part) for part in text.split('-')]
            return datetime.date(year,month,day)
        except ValueError:
            # Well-formed but impossible date such as 2013-02-30.
            pass
    print(text + " is not a right date format.")
    sys.exit(0)

argST = defaultStartTime
argED = defaultEndTime
argT = defaultSockTimeLimit
args = sys.argv
if not os.path.exists(argD):
    os.mkdir(argD)
# Calendar arithmetic instead of adding 86400 seconds to a local timestamp,
# which can land on the wrong date around daylight-saving changes.
_oneDay = datetime.timedelta(days=1)
if len(args) == 1:
    # no argument: today only
    _today = datetime.date.today()
    argST = _today.isoformat()
    argED = (_today + _oneDay).isoformat()
elif len(args) == 2:
    if args[1] == "-a":
        # -a: the 30 days ending with today
        _tomorrow = datetime.date.today() + _oneDay
        argED = _tomorrow.isoformat()
        argST = (_tomorrow - datetime.timedelta(days=30)).isoformat()
    elif args[1] == "-help":
        print("help info of this commond:\nno arg: get news of today\n-a: get recent 30 days' news\n-d 2013-11-01: get news from the day\n-d 2013-11-01 2013-11-02: get news from the begin day to end day\n-dx 2013-11-01 t: get t days' news from the day\n")
        sys.exit(0)
    else:
        _commandError()
elif len(args) == 3:
    if args[1] != "-d":
        _commandError()
    # -d DATE: the given day only
    argED = (_parseDateArg(args[2]) + _oneDay).isoformat()
    argST = args[2]
elif len(args) == 4:
    if args[1] not in ("-d","-dx"):
        _commandError()
    _startDay = _parseDateArg(args[2])
    if args[1] == "-d":
        # -d START END: the inclusive range
        argED = (_parseDateArg(args[3]) + _oneDay).isoformat()
    else:
        # -dx START t: t days starting at START; at least one digit is
        # required so that int() cannot fail on an empty string.
        if not re.match(r"\d+$",args[3]):
            print(args[3] + " is not a right number format.")
            sys.exit(0)
        try:
            argED = (_startDay + datetime.timedelta(days=int(args[3]))).isoformat()
        except OverflowError:
            print(args[3] + " is not a right number format.")
            sys.exit(0)
    argST = args[2]
else:
    # Too many arguments used to fall through silently to the hard-coded
    # default dates; treat it as a command error instead.
    _commandError()
G = GetChinaNews(str_start_time=argST,str_end_time=argED,dirName=argD,siteList=defaultSiteList,timeLimit=argT)
G.getChinaNews()