scraping code misses listings

kolarmi19 · (This post was last modified: Jan-27-2023, 10:02 AM by buran.)

Hey everyone
I've got this Python scraping code that was made for me. It worked perfectly for a few months, but lately it misses some parameters while scraping.
You can see the missing cells starting from row 94.

[Image: image.png?__cld_token__=exp=1674835006~h...d42f23a1a9]

[Image: image.png?__cld_token__=exp=1674835006~h...d42f23a1a9]

I'd be glad to get some help with this. Thanks :-)

# -*- coding: utf-8 -*-
"""ad.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/17lhluDBQi5LVjkQ8kPXNezFnqpEvU17P
"""



import json
import time

import lxml.html
import pandas as pd
import requests
from bs4 import BeautifulSoup as soup

# Search-results URL template; the "{}" placeholder is filled with the page index.
start_url = "https://www.ad.co.il/car?rp264=2022,2022&rp270=5000,120000&rp271=5000,5000&pageindex={}"
# Site root that is prefixed to the hrefs taken from the results pages.
base_url = "https://www.ad.co.il"
# START is the first page index requested by the paging loop.
# NOTE(review): END is not referenced anywhere in the visible code.
START, END = (1,8)

def get_gallery_image(tree):
    """Return the URL of the listing's main gallery image.

    Args:
        tree: a parsed document exposing ``cssselect`` (an lxml element).

    Returns:
        The image URL as a string, or ``None`` when the page has no matching
        ``<img>`` element or that element carries no ``src`` attribute. In
        both of those cases "Image not found" is printed.

    A scheme-less ``src`` (e.g. ``//cdn.host/a.jpg``) gets ``https:``
    prepended, as before. A ``src`` that already starts with ``http://`` or
    ``https://`` is returned untouched instead of being prefixed a second
    time, and a missing ``src`` no longer yields the string "https:None".
    """
    matches = tree.cssselect("img.main-gallery-image.justify-content-center")
    src = matches[0].attrib.get("src") if matches else None
    if not src:
        print("Image not found")
        return None
    if src.startswith(("http://", "https://")):
        # Already absolute: prepending another scheme would corrupt it.
        return src
    return f"https:{src}"

def get_price_metadata(tree):
    """Return the texts of the last two card-title headings, last one first.

    The headings matched by the selector are read in reverse document
    order and stripped of surrounding whitespace. The result is always a
    2-tuple; positions for which no heading exists hold an empty string.
    """
    headings = tree.cssselect("div.d-flex.justify-content-between h2.card-title")
    texts = [node.text_content().strip() for node in reversed(headings)]
    # Pad so that zero or one heading still unpacks into two values.
    first, second = (texts + ["", ""])[:2]
    return first, second

def _first_dict(value):
    """Return ``value`` if it is a dict, the first dict in a list, else ``{}``."""
    if isinstance(value, dict):
        return value
    if isinstance(value, list):
        for item in value:
            if isinstance(item, dict):
                return item
    return {}


def get_contact_info(Soup):
    """Extract the seller's name and phone number from the page's JSON-LD.

    Reads the first ``<script type="application/ld+json">`` element and
    looks up ``offers -> seller -> name`` and
    ``offers -> seller -> contactPoint -> telephone``.

    Args:
        Soup: a parsed page exposing ``find`` (a BeautifulSoup document).

    Returns:
        ``(contact_name, contact_num)``. Either element is an empty string
        when the corresponding piece of data is unavailable.

    Unlike the previous version, this never raises for a missing script
    body, malformed JSON, or an ``offers`` object without a ``seller`` key;
    all of those simply produce empty strings.
    """
    script = Soup.find("script", attrs={"type": "application/ld+json"})
    raw = script.string if script is not None else None
    if not raw:
        return "", ""
    try:
        data = json.loads(raw)
    except ValueError:
        # Covers json.JSONDecodeError: a malformed payload means no contact data.
        return "", ""

    offers = _first_dict(data.get("offers")) if isinstance(data, dict) else {}
    seller = _first_dict(offers.get("seller"))
    contact_point = _first_dict(seller.get("contactPoint"))

    contact_name = seller.get("name") or ""
    contact_num = contact_point.get("telephone") or ""
    return contact_name, contact_num

def scrap_tables(Soup):
    """Return the first ``<table>`` of the page and a placeholder.

    The result is a ``(first_table, second_table)`` pair. ``first_table`` is
    the first table element found, or ``None`` (after printing
    "No table found.") when the page has no tables. ``second_table`` is
    always ``None``: this function never extracts a second table.
    """
    tables = Soup.findAll("table")
    second_table = None
    if tables:
        first_table = tables[0]
    else:
        first_table = None
        print ("No table found.")
    return first_table, second_table

def create_df_row(link, gallary_image, price_tag, price, contact_tag, contact_num, first_table, second_table,AgencyCar,dic):
    """Assemble one spreadsheet row (a dict) for a scraped listing.

    Args:
        link: URL of the listing; stored under "link".
        gallary_image: image URL; stored under "img".
        price_tag: stored under "price".
        price: stored under "car_model".
        contact_tag: stored under "contact_name".
        contact_num: stored under "contact_num".
        first_table, second_table: table elements exposing ``find_all``, or
            ``None``. For every row with at least two ``<td>`` cells the
            first cell's text becomes a key and the second cell's text its
            value.
        AgencyCar: stored under "agency car".
        dic: extra key/value pairs merged into the row before the table
            data (``None`` is treated as empty).

    Returns:
        The row dict. Later sources overwrite earlier ones on key clashes:
        fixed fields, then ``dic``, then ``first_table``, then
        ``second_table``. New rows are always flagged ``newListing=True``
        and ``isItAvailable=True``.

    Rows with fewer than two ``<td>`` cells (header rows built from
    ``<th>``, spacer rows) are skipped; previously they raised IndexError
    and aborted the whole scrape.
    """
    dictionary = {
        "link": link,
        "img": gallary_image,
        "car_model": price,
        "price": price_tag,
        "contact_name": contact_tag,
        "contact_num": contact_num,
        "newListing": True,
        "isItAvailable": True,
        "agency car": AgencyCar,
    }
    dictionary.update(dic or {})
    for table in (first_table, second_table):
        if table is None:
            continue
        for tr in table.find_all("tr"):
            tds = tr.find_all("td")
            if len(tds) < 2:
                # Not a key/value row; nothing to record.
                continue
            dictionary[tds[0].text.strip()] = tds[1].text.strip()
    return dictionary

# ---------------------------------------------------------------------------
# Main script: refresh output.xlsx with the current state of the listings.
#
# Rows already in the workbook start as isItAvailable=False / newListing=''.
# A row whose link is seen again on the site gets isItAvailable='' and its
# price history updated; links not yet in the workbook are scraped and
# appended with newListing=True / isItAvailable=True.
# ---------------------------------------------------------------------------

PAGE_SIZE = 48         # a results page with fewer cards than this is the last one
LAST_PAGE = 100        # exclusive upper bound on the page index
REQUEST_TIMEOUT = 30   # seconds; without it a stalled connection hangs forever


def _cell_text(value):
    """Return ``value`` stripped when it is a string, otherwise ''."""
    return value.strip() if isinstance(value, str) else ""


def _refresh_known_listing(frame, label, card):
    """Update price history and availability flags of an existing row.

    ``frame`` is modified in place at index label ``label``; ``card`` is the
    results-page card element of the listing.
    """
    price_div = card.find("div", attrs={"class": "price"})
    if price_div is None:
        print("No price found on card")
    else:
        cur_price = price_div.text.strip()
        old_price = _cell_text(frame.at[label, "price"])
        if old_price != cur_price:
            history = frame.at[label, "oldPrice"]
            if pd.isna(history):
                # No history yet: start it from the previously stored price.
                history = old_price
            frame.at[label, "oldPrice"] = f"{history} - {cur_price}"
            frame.at[label, "price"] = cur_price
    frame.at[label, "newListing"] = ""
    frame.at[label, "isItAvailable"] = ""


def _scrape_listing(car_link, car_html):
    """Build the spreadsheet row for one listing detail page."""
    page = soup(car_html, "html.parser")
    sub_tree = lxml.html.fromstring(car_html)
    gallary_image = get_gallery_image(sub_tree)
    price_tag, price = get_price_metadata(sub_tree)
    contact_tag, contact_num = get_contact_info(page)
    first_table, second_table = scrap_tables(page)

    agency_badge = page.find("div", attrs={"class": "px-3 text-primary font-weight-bold"})
    agency_car = "yes" if agency_badge else "no"

    extras = {}
    description = page.find("p", attrs={"class": "text-word-break"})
    if description:
        extras["Description"] = description.text.strip()
    for block in page.find_all("div", attrs={"class": "px-3"}):
        # "key: value" blocks only; text without a colon is ignored.
        parts = block.text.strip().split(":")
        if len(parts) > 1:
            extras[parts[0].strip()] = parts[1].strip()
    seller_box = page.find("div", attrs={"class": "d-inline-flex text-end fs--1"})
    seller_link = seller_box.find("a") if seller_box else None
    if seller_link is not None:
        val = seller_link.text.strip()
        # The label is whatever text remains once the link text is removed.
        extras[seller_box.text.strip().replace(val, "")] = val

    return create_df_row(car_link, gallary_image, price_tag, price, contact_tag,
                         contact_num, first_table, second_table, agency_car, extras)


df = pd.read_excel("output.xlsx")

# The update logic reads and writes these columns; make sure they exist and
# can hold strings even when the workbook column was empty (all-NaN floats).
for column in ("price", "oldPrice"):
    if column not in df.columns:
        df[column] = None
    df[column] = df[column].astype(object)

df["newListing"] = [""] * df.shape[0]
# object dtype: the column later holds '' and True next to False.
df["isItAvailable"] = pd.Series([False] * df.shape[0], index=df.index, dtype=object)

print(df.shape)

# link -> index label of its first occurrence in the workbook.
known_rows = {}
for label, link in df["link"].items():
    known_rows.setdefault(link, label)

new_rows = []
scraped_links = set()   # links scraped during this run, to avoid duplicate rows

for page_index in range(START, LAST_PAGE):
    temp_url = start_url.format(page_index)
    print(temp_url)
    response = requests.get(temp_url, timeout=REQUEST_TIMEOUT)
    if response.status_code != 200:
        print(f"invalid status: {response.status_code} for {temp_url}")
        continue

    results_page = soup(response.text, "html.parser")
    container = results_page.find("div", attrs={"id": "cards"})
    if container is None:
        print(f"No listing container found on {temp_url}")
        cards = []
    else:
        cards = container.find_all("div", attrs={"class": "card overflow-hidden"})
    print(len(cards))

    for card in cards:
        anchor = card.find("a")
        href = anchor.get("href") if anchor is not None else None
        if not href:
            print("No car link found!")
            continue
        car_link = f"{base_url}{href}"
        print(car_link)

        if car_link in known_rows:
            print("already")
            _refresh_known_listing(df, known_rows[car_link], card)
            continue
        if car_link in scraped_links:
            continue

        car_res = requests.get(car_link, timeout=REQUEST_TIMEOUT)
        if car_res.status_code != 200:
            # Skip instead of storing a row built from an error page; the
            # listing is picked up again on the next run.
            print(f"invalid status: {car_res.status_code} for {car_link}")
            continue
        scraped_links.add(car_link)
        new_rows.append(_scrape_listing(car_link, car_res.text))

    if len(cards) < PAGE_SIZE:
        break

if new_rows:
    # DataFrame.append no longer exists in pandas 2.x; concatenate once.
    df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)

df = df.fillna("")
df.to_excel("output.xlsx", index=False)

buran write Jan-27-2023, 10:02 AM:
Please, when posting code, use proper BBCode tags - e.g. python, not quote

Possibly Related Threads…
Thread		Author	Replies	Views	Last Post
	Code Help, web scraping non uniform lists(ul)	luke_m	4	4,861	Apr-22-2021, 05:16 PM Last Post: luke_m
	scraping code	nexuz89	0	2,310	Sep-28-2020, 12:16 PM Last Post: nexuz89
	In need of web scraping code!	kolbyng	1	2,743	Sep-21-2020, 06:02 AM Last Post: buran
	error in code web scraping	alexisbrunaux	5	5,993	Aug-19-2020, 02:31 AM Last Post: alexisbrunaux
	scraping from a website that hides source code	PIWI_Protein	1	3,328	Mar-27-2020, 05:08 PM Last Post: Larz60+

scraping code misses listings

User Panel Messages

Announcements