Dec-19-2017, 05:36 AM
hi,
i am trying to webscrape this webpage:
https://maharerait.mahaonline.gov.in//Pr...JDSA%3d%3d
and the value i am trying to webscrape is the text PARAM DEVELOPERS
the code i have written in my original code is
i have attached the txt file even to run this code
i am trying to webscrape this webpage:
https://maharerait.mahaonline.gov.in//Pr...JDSA%3d%3d
and the value i am trying to webscrape is the text PARAM DEVELOPERS
the code i have written in my original code is
import urllib.request
import urllib
import json
import xml.etree.ElementTree as ET
import csv
from bs4 import BeautifulSoup
link = 'https://maharerait.mahaonline.gov.in/searchlist/searchlist'
talukaLink = "https://maharerait.mahaonline.gov.in/SearchList/GetTaluka"
distlink = "https://maharerait.mahaonline.gov.in/SearchList/GetDistrict"
prjLink = "https://maharerait.mahaonline.gov.in/SearchList/GetProjectName"
links = []
certificatedata = []
def parseJson(data):
parsed = json.loads(data)
return parsed
def writedata(alldata1, filename):
print(" >>>> FINAL PRINTING DATA >>>> ")
#import pdb; pdb.set_trace()
with open("./"+filename,'w') as csvfile:
csvfile = csv.writer(csvfile, delimiter=',')
#csvfile.writerow(titleRow)
csvfile.writerow("")
for i in range(0, len( alldata1 )):
#print(alldata1[i])
csvfile.writerow( alldata1[i] )
def processlinksforcert():
global links, certificatedata
print(">> Came in fetching certificates data >>> " )
for sublist in links:
certno = sublist[0]
link = sublist[1]
htmldata = getData(link, {})
soup = BeautifulSoup(htmldata, "html.parser")
divs = soup.find_all("div")
#if len(certificatedata)>20:
# break
for div in divs:
attr = div.attrs
if "id" in attr.keys() and "DivExp" in attr['id']:
table = div.find_all(class_="grid-wrap")
for more in table:
text = more.find_all("tr")[1:]#if header has any TH
for tds in text:
td = tds.find_all("td")[1]
rnumber = ""
for num in td:
rnumber = num
sublist = []
sublist.append(certno)
sublist.append(rnumber)
td1 = tds.find_all("td")[2]
project = ""
for prj in td1:
project = prj
sublist.append(project)
td2 = tds.find_all("td")[3]
others = ""
for oth in td2:
others = oth
sublist.append(others)
td3 = tds.find_all("td")[4]
area = ""
for ara in td3:
area = ara
sublist.append(area)
td4 = tds.find_all("td")[5]
add = ""
for address in td4:
add = address
sublist.append(add)
td5 = tds.find_all("td")[6]
cts = ""
for ctsn in td5:
cts = ctsn
sublist.append(cts)
td6 = tds.find_all("td")[7]
buildings = ""
for build in td6:
buildings = build
sublist.append(buildings)
td7 = tds.find_all("td")[8]
apartments = ""
for apart in td7:
apartments = apart
sublist.append(apartments)
td8 = tds.find_all("td")[9]
original = ""
for date in td8:
original = date
sublist.append(original)
td9 = tds.find_all("td")[10]
actual = ""
for adate in td9:
actual = adate
sublist.append(actual)
certificatedata.append(sublist)
org = div.find_all(class_="col-md-3 col-sm-3")[4]
count = 0
val = len(certificatedata)
sublist1 = certificatedata[val -1]
for div1 in org:
sublist1.append(div1.get_text)
certificatedata[val-1] = sublist1
count +=1
def formattext(text):
while text.find("\r\n")>=0:
text = text.replace("\r\n","")
while text.find(" ")>=0:
text = text.replace(" ","")
return text
def readlinksdata():
global links
f = open("./jsondata.txt", "r")
txt = f.read()
f.close()
links = json.loads(txt)
def main():
global alldata, certificatedata
#data = getData(url, {})
#getProjectsList()
#print("Before write the projects data to the file. Count >> "+str(len(alldata)))
#writedata(alldata, "data.csv")
readlinksdata()
data = processlinksforcert()
print("Before write the certificates data to the file. Count >> "+str(len(data)))
writedata( data, "certificate2.csv" )
def getData(url, values):
import requests
#import pdb; pdb.set_trace()
#req = requests.get(link)
print("url >> "+url)
req = requests.post(url, data=values, timeout=10)
text = req.text
req.close()
#print("hello world"+str(values))
return text
#getDataByReq()
main()and the error i am getting is this :Error:Traceback (most recent call last):
File "C:\Users\prince.bhatia\Desktop\maharera\Past_Details.py", line 156, in <module>
main()
File "C:\Users\prince.bhatia\Desktop\maharera\Past_Details.py", line 137, in main
data = processlinksforcert()
File "C:\Users\prince.bhatia\Desktop\maharera\Past_Details.py", line 108, in processlinksforcert
sublist1 = certificatedata[val -1]
IndexError: list index out of rangecan anyone tell me what i am doing wrong?i have attached the txt file even to run this code
Attached Files
