Hey everyone,
I've got this Python scraping code that was written for me. It worked perfectly for a few months, but lately it misses some parameters while scraping.
You can see the missing cells starting from row 94.
![[Image: image.png?__cld_token__=exp=1674835006~h...d42f23a1a9]](https://fiverr-res.cloudinary.com/image/upload/f_auto,q_auto/v1/secured-attachments/messaging_message/attachment/1991fc9dd15c5593624cf9403c49fee3-1674414016322/image.png?__cld_token__=exp=1674835006~hmac=da91313fdd7ddcdb00049e204d0bbc5a00855d8b2ddd3c7df5d007d42f23a1a9)
I'd be glad to get some help with this. Thanks :-)
I've got this Python scraping code that was written for me. It worked perfectly for a few months, but lately it misses some parameters while scraping.
You can see the missing cells starting from row 94.
![[Image: image.png?__cld_token__=exp=1674835006~h...d42f23a1a9]](https://fiverr-res.cloudinary.com/image/upload/f_auto,q_auto/v1/secured-attachments/messaging_message/attachment/1991fc9dd15c5593624cf9403c49fee3-1674414016322/image.png?__cld_token__=exp=1674835006~hmac=da91313fdd7ddcdb00049e204d0bbc5a00855d8b2ddd3c7df5d007d42f23a1a9)
I'd be glad to get some help with this. Thanks :-)
# -*- coding: utf-8 -*-
"""ad.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/17lhluDBQi5LVjkQ8kPXNezFnqpEvU17P
"""
import requests
import time
import lxml.html
from bs4 import BeautifulSoup as soup
import pandas as pd
# Site root; relative listing hrefs are appended to it.
base_url = "https://www.ad.co.il"

# Search-results URL; the "{}" placeholder receives the page index.
start_url = "https://www.ad.co.il/car?rp264=2022,2022&rp270=5000,120000&rp271=5000,5000&pageindex={}"

# First results page to request, and a nominal last page.
# END is not referenced by the visible code.
START, END = 1, 8
def get_gallery_image(tree):
    """Return the URL of the main gallery image of a car page.

    Args:
        tree: Parsed page exposing ``cssselect`` (e.g. an ``lxml.html``
            element).

    Returns:
        The image URL as a string, or ``None`` (after printing
        "Image not found") when the page has no main gallery image or the
        image element has no usable ``src`` attribute.
    """
    images = tree.cssselect("img.main-gallery-image.justify-content-center")
    src = (images[0].attrib.get("src") or "").strip() if images else ""
    if not src:
        # Also covers an <img> without src, which used to produce the
        # bogus string "https:None".
        print("Image not found")
        return None
    # An already absolute URL must not get a second scheme prepended.
    if src.startswith(("http://", "https://")):
        return src
    # Remaining values are treated as protocol-relative ("//host/path").
    return f"https:{src}"
def get_price_metadata(tree):
    """Return the last two card titles of a car page, last one first.

    Args:
        tree: Parsed page exposing ``cssselect``; every matched node must
            provide ``text_content()``.

    Returns:
        A 2-tuple ``(last_title, second_to_last_title)`` of stripped
        strings. Missing titles are returned as empty strings.
    """
    titles = [
        node.text_content().strip()
        for node in tree.cssselect("div.d-flex.justify-content-between h2.card-title")
    ]
    # Walk the titles backwards and pad, so short lists still give a pair.
    last_title, previous_title = (titles[::-1] + ["", ""])[:2]
    return last_title, previous_title
def get_contact_info(Soup):
    """Extract the seller's name and phone number from the page's JSON-LD.

    Reads the first ``<script type="application/ld+json">`` element and
    looks up ``offers -> seller -> name`` and
    ``offers -> seller -> contactPoint -> telephone``.

    Args:
        Soup: Parsed page exposing ``find`` (e.g. a ``BeautifulSoup``
            object).

    Returns:
        A 2-tuple ``(contact_name, contact_num)``. Each element is an empty
        string when the corresponding piece of data is absent or the script
        content is not usable JSON; the function never raises for missing
        or malformed data.
    """
    contact_name = ""
    contact_num = ""

    script = Soup.find("script", attrs={"type": "application/ld+json"})
    raw = script.string if script is not None else None
    if not raw:
        # No script, or a script whose content is not a single string.
        return contact_name, contact_num

    try:
        data = json.loads(raw)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass.
        return contact_name, contact_num

    offers = data.get("offers") if isinstance(data, dict) else None
    if isinstance(offers, list):
        # "offers" may be given as an array; use its first object entry.
        offers = next((item for item in offers if isinstance(item, dict)), None)
    seller = offers.get("seller") if isinstance(offers, dict) else None
    if not isinstance(seller, dict):
        return contact_name, contact_num

    contact_name = seller.get("name") or ""
    contact_point = seller.get("contactPoint")
    if isinstance(contact_point, dict):
        contact_num = contact_point.get("telephone") or ""
    return contact_name, contact_num
def scrap_tables(Soup):
    """Return the first ``<table>`` element of the page.

    Args:
        Soup: Parsed page exposing ``findAll``.

    Returns:
        A 2-tuple ``(first_table, second_table)``. ``first_table`` is the
        first table found, or ``None`` (after printing "No table found.")
        when the page has none. ``second_table`` is always ``None``.
    """
    tables = Soup.findAll("table")
    if len(tables) == 0:
        print("No table found.")
        return None, None
    return tables[0], None
def _add_table_rows(record, table):
    """Copy every two-cell row of *table* into *record* as label -> value."""
    if table is None:
        return
    for tr in table.find_all("tr"):
        cells = tr.find_all("td")
        # Header or spacer rows have fewer than two <td> cells; indexing
        # them used to raise IndexError and abort the whole scrape.
        if len(cells) < 2:
            continue
        record[cells[0].text.strip()] = cells[1].text.strip()


def create_df_row(link, gallary_image, price_tag, price, contact_tag, contact_num, first_table, second_table, AgencyCar, dic):
    """Build the record (one output row) for a newly scraped car page.

    Args:
        link: URL of the car page.
        gallary_image: URL of the gallery image (may be ``None``).
        price_tag: Value stored under the ``"price"`` key.
        price: Value stored under the ``"car_model"`` key.
        contact_tag: Seller name, stored under ``"contact_name"``.
        contact_num: Seller phone number.
        first_table: Table element (or ``None``) whose two-cell rows are
            added as ``label -> value`` entries.
        second_table: Optional second table, handled like ``first_table``.
        AgencyCar: Value stored under ``"agency car"``.
        dic: Extra key/value pairs merged in before the table rows, so a
            table row with the same label overrides them.

    Returns:
        A dict with the fixed columns, the entries of ``dic`` and one entry
        per usable table row. ``newListing`` and ``isItAvailable`` are
        always ``True``.
    """
    dictionary = {
        "link": link,
        "img": gallary_image,
        "car_model": price,
        "price": price_tag,
        "contact_name": contact_tag,
        "contact_num": contact_num,
        "newListing": True,
        "isItAvailable": True,
        "agency car": AgencyCar,
    }
    dictionary.update(dic)
    _add_table_rows(dictionary, first_table)
    _add_table_rows(dictionary, second_table)
    return dictionary
# ---------------------------------------------------------------------------
# Main script: refresh output.xlsx with the current search results.
#
# * Listings already in the sheet get their price history updated and are
#   flagged as still present.
# * Listings not yet in the sheet are scraped from their detail page and
#   added as new rows.
# ---------------------------------------------------------------------------
import json  # kept at module level: get_contact_info() relies on it
import datetime

REQUEST_TIMEOUT = 30   # seconds before a request is abandoned
REQUEST_DELAY = 0.5    # pause between detail-page requests, in seconds
FULL_PAGE_SIZE = 48    # a results page with fewer cards is the last one


def _fetch(url):
    """GET *url*; return the response, or None when the request itself fails."""
    try:
        return requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print(f"request failed for {url}: {exc}")
        return None


df = pd.read_excel("output.xlsx")
# Known listings start as "not new" / "not available"; the ones found again
# below get their availability flag reset.
df["newListing"] = [''] * df.shape[0]
# object dtype so the flag can later hold '' as well as booleans.
df["isItAvailable"] = pd.Series([False] * df.shape[0], index=df.index, dtype=object)
for column in ("price", "oldPrice"):
    if column not in df.columns:
        df[column] = pd.NA
    # object dtype so string prices can be written into all-empty columns.
    df[column] = df[column].astype(object)
print(df.shape)

# link -> row label of its first occurrence (replaces list.index lookups).
known_rows = {}
for row_label, known_link in df["link"].items():
    known_rows.setdefault(known_link, row_label)

new_rows = []         # records for listings seen for the first time
scraped_now = set()   # links scraped during this run

# NOTE(review): END is defined above but the loop runs up to page 99 and
# stops at the first short page instead - confirm which limit is intended.
for i in range(START, 100):
    temp_url = start_url.format(i)
    print(temp_url)
    response = _fetch(temp_url)
    if response is None:
        continue
    if response.status_code != 200:
        print(f"invalid status: {response.status_code} for {temp_url}")
        continue

    listing_soup = soup(response.text, "html.parser")
    cards_container = listing_soup.find("div", attrs={"id": "cards"})
    if cards_container is None:
        car_cards = []
    else:
        car_cards = cards_container.find_all("div", attrs={"class": "card overflow-hidden"})
    print(len(car_cards))

    for card in car_cards:
        anchor = card.find("a")
        href = anchor.get("href") if anchor is not None else None
        if not href:
            print("No car link found!")
            continue
        car_link = f"{base_url}{href}"
        print(car_link)

        if car_link in known_rows:
            print("already")
            row_label = known_rows[car_link]
            stored_price = df.at[row_label, "price"]
            old_price = stored_price.strip() if isinstance(stored_price, str) else ''
            price_history = df.at[row_label, "oldPrice"]
            price_div = card.find("div", attrs={"class": "price"})
            if price_div is not None:
                current_price = price_div.text.strip()
                if old_price != current_price:
                    if pd.isna(price_history):
                        df.at[row_label, "oldPrice"] = old_price + " - " + current_price
                    else:
                        df.at[row_label, "oldPrice"] = str(price_history) + " - " + current_price
                    df.at[row_label, "price"] = current_price
            # NOTE(review): the pasted source lost its indentation; the flags
            # are assumed to be reset for every listing found again, not only
            # when the price changed.
            df.at[row_label, "newListing"] = ''
            df.at[row_label, "isItAvailable"] = ''
            continue

        if car_link in scraped_now:
            # Same listing shown twice in this run.
            continue

        car_res = _fetch(car_link)
        # Pace the requests. NOTE(review): presumably the blank cells come
        # from throttled or failed detail pages - confirm against the site.
        time.sleep(REQUEST_DELAY)
        if car_res is None:
            continue
        if car_res.status_code != 200 or not car_res.text.strip():
            # Skip instead of storing a blank row: the link stays unknown,
            # so the listing is scraped again on the next run.
            print(f"invalid status: {car_res.status_code} for {car_link}")
            continue

        car_html = car_res.text
        Soup = soup(car_html, 'html.parser')
        sub_tree = lxml.html.fromstring(car_html)
        gallary_image = get_gallery_image(sub_tree)
        price_tag, price = get_price_metadata(sub_tree)
        contact_tag, contact_num = get_contact_info(Soup)
        first_table, second_table = scrap_tables(Soup)

        agency_div = Soup.find("div", attrs={"class": "px-3 text-primary font-weight-bold"})
        AgencyCar = "yes" if agency_div is not None else "no"

        dic = {}
        description = Soup.find("p", attrs={"class": "text-word-break"})
        if description is not None:
            dic["Description"] = description.text.strip()

        # "label: value" blocks; text without a colon is ignored.
        for px in Soup.find_all("div", attrs={"class": "px-3"}):
            parts = px.text.strip().split(":")
            if len(parts) >= 2:
                dic[parts[0].strip()] = parts[1].strip()

        link_block = Soup.find("div", attrs={"class": "d-inline-flex text-end fs--1"})
        if link_block is not None:
            link_anchor = link_block.find("a")
            if link_anchor is not None:
                val = link_anchor.text.strip()
                # The block's text minus the anchor text is used as the key.
                key = link_block.text.strip().replace(val, "")
                dic[key] = val

        row = create_df_row(car_link, gallary_image, price_tag, price, contact_tag, contact_num, first_table, second_table, AgencyCar, dic)
        new_rows.append(row)
        scraped_now.add(car_link)

    if len(car_cards) < FULL_PAGE_SIZE:
        break

# DataFrame.append was removed in pandas 2.0; add all new rows in one step.
if new_rows:
    df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)
df = df.fillna("")
df.to_excel("output.xlsx", index=False)
buran wrote Jan-27-2023, 10:02 AM:
Please use proper BBCode tags when posting code - e.g.
Please use proper BBCode tags when posting code - e.g.
python, not quote