Jun-12-2019, 08:29 AM
Hi, I am having difficulty scraping links from a search results page and then following each of those links to scrape data from the linked pages.
Scraping the links is done, but following the links collected in that first pass and then collecting data from each linked page is where I am stuck.
I have attached my first-tier code, but I can't figure out the second-tier code.
I would appreciate any help.
Thanks.
Scraping the links is done, but following the links collected in that first pass and then collecting data from each linked page is where I am stuck.
I have attached my first-tier code, but I can't figure out the second-tier code.
I would appreciate any help.
Thanks.
from bs4 import BeautifulSoup
import requests
import pandas as pd
from urllib.request import urlopen,urlparse, Request,HTTPError
import urllib
import re
import numpy as np
import csv
from http.client import BadStatusLine
import ssl
import json
#from googlesearch import search
class Google:
@classmethod
def search1(self, search):
url_list = [] #store all the extracted urls in a List
title_list = [] #store all the extracted titles in a List
description_list = [] #store all the extracted Description in a List
all_links = []
for start in range(0,10):
#page = requests.get('https://www.google.com/search?rlz=1C1CHBF_enSG851SG851&ei=Nib2XI6FEcmLvQS1xb-wBQ&q=site%3Alinkedin.com+inurl%3Ain+%7C+inurl%3Apub+%7C+inurl%3Aprofile+-inurl%3Adir+-inurl%3Atitle+-inurl%3Agroups+-inurl%3Acompany+-inurl%3Ajobs+-inurl%3Ajobs2+VP&oq=site%3Alinkedin.com+inurl%3Ain+%7C+inurl%3Apub+%7C+inurl%3Aprofile+-inurl%3Adir+-inurl%3Atitle+-inurl%3Agroups+-inurl%3Acompany+-inurl%3Ajobs+-inurl%3Ajobs2'+search+str(start*10), verify = False)
page = requests.get('http://www.google.com/search?q='+search+str(start*10), verify = False, timeout=5)
#page = requests.get('https://www.google.com/search?q='+search, verify = True)
soup = BeautifulSoup(page.content, "lxml")
#soup = BeautifulSoup(page.content)
for link in soup.find_all("a",href=re.compile("(?<=/url\?q=)(htt.*://.*)")): #original working code
a = (re.split(":(?=http)",link["href"].replace("/url?q=","")))
a = a[0].split("&")[0]
url_list.append(a)
