visit
The client provided a PDF attachment under the job posting which describes the steps clearly. Here is the description of the task: I need help setting up a web scraper that gets data from a website. What I need it to do is:
virtualenv env
. env/bin/activate
pip install selenium xlwt
As you know, Selenium is a web-automation tool, and we are going to use it to navigate the target pages and get data from them. xlwt is a library for generating spreadsheet files compatible with Microsoft Excel; the package itself is pure Python with no dependencies on modules or packages outside the standard Python distribution.
I know some of you will tell me to use pandas, but let's just keep xlwt for this project.

from selenium import webdriver
class Bolagsverket:
    """Selenium-driven scraper for Bolagsverket's public announcement site."""

    def __init__(self):
        # Point this at your local geckodriver binary.
        self.bot = webdriver.Firefox(executable_path='/path/to/geckodriver')
Now we create a new function named navigate_and_crawl:
import time

def navigate_and_crawl(self):
    """Open the Bolagsverket start page, pausing afterwards to mimic a human delay."""
    bot = self.bot
    # An explicit scheme is required: webdriver.get() cannot load a
    # protocol-relative ('//...') URL.
    bot.get('https://poit.bolagsverket.se/poit/PublikPoitIn.do')
    time.sleep(5)
As you can see, I put a sleep() call right after navigating to the URL, to mimic a human delay.
Enter the website and navigate to "Sök kungörelse"
def navigate_and_crawl(self):
    """Open the start page and click through to "Sök kungörelse"."""
    bot = self.bot
    # 'https://' added: webdriver.get() cannot load a protocol-relative URL.
    bot.get('https://poit.bolagsverket.se/poit/PublikPoitIn.do')
    time.sleep(5)
    # "Sök kungörelse" entry in the navigation menu.
    bot.find_element_by_id('nav1-2').click()
    time.sleep(5)
def navigate_and_crawl(self):
    """Open the start page, go to "Sök kungörelse" and enter the search form."""
    bot = self.bot
    # 'https://' added: webdriver.get() cannot load a protocol-relative URL.
    bot.get('https://poit.bolagsverket.se/poit/PublikPoitIn.do')
    time.sleep(5)
    # "Sök kungörelse" entry in the navigation menu.
    bot.find_element_by_id('nav1-2').click()
    time.sleep(5)
    # The first link inside the form leads to the search page.
    bot.find_element_by_tag_name('form').find_element_by_tag_name('a').click()
    time.sleep(5)
- Tidsperiod > Annan period > Input date-interval for the past day
search_form = bot.find_element_by_tag_name('form')
# Plain string literal: the XPath contains no placeholders, so the f-prefix
# was unnecessary.
search_form.find_element_by_xpath("//select[@id='tidsperiod']/option[text()='Annan period']").click()
import datetime
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
def navigate_and_crawl(self):
    """Fill in the whole search form (period, subject, headings) and submit it."""
    bot = self.bot
    # 'https://' added: webdriver.get() cannot load a protocol-relative URL.
    bot.get('https://poit.bolagsverket.se/poit/PublikPoitIn.do')
    time.sleep(5)
    bot.find_element_by_id('nav1-2').click()
    time.sleep(5)
    bot.find_element_by_tag_name('form').find_element_by_tag_name('a').click()
    time.sleep(5)
    search_form = bot.find_element_by_tag_name('form')
    # Plain string literals throughout: these XPaths contain no placeholders,
    # so the original f-prefixes were unnecessary.
    search_form.find_element_by_xpath("//select[@id='tidsperiod']/option[text()='Annan period']").click()
    wait = WebDriverWait(bot, 10)
    # Tidsperiod > Annan period > date interval.
    input_from = wait.until(EC.element_to_be_clickable((By.XPATH, "//input[@id='from']")))
    # input_from.send_keys(str(datetime.date.today() - datetime.timedelta(1)))
    input_from.send_keys('2019-09-23')
    input_to = wait.until(EC.element_to_be_clickable((By.XPATH, "//input[@id='tom']")))
    # input_to.send_keys(str(datetime.date.today()))
    input_to.send_keys('2019-09-24')
    time.sleep(3)
    # Drill down: subject area > announcement heading > subheading.
    amnesomrade = wait.until(EC.element_to_be_clickable((By.XPATH, "//select[@id='amnesomrade']")))
    amnesomrade.find_element_by_xpath("//select[@id='amnesomrade']/option[text()='Bolagsverkets registreringar']").click()
    time.sleep(2)
    kungorelserubrik = wait.until(EC.element_to_be_clickable((By.XPATH, "//select[@id='kungorelserubrik']")))
    kungorelserubrik.find_element_by_xpath("//select[@id='kungorelserubrik']/option[text()='Aktiebolagsregistret']").click()
    time.sleep(2)
    underrubrik = wait.until(EC.element_to_be_clickable((By.XPATH, "//select[@id='underrubrik']")))
    underrubrik.find_element_by_xpath("//select[@id='underrubrik']/option[text()='Nyregistreringar']").click()
    time.sleep(2)
    # Search button
    button_sok = wait.until(EC.element_to_be_clickable((By.XPATH, "//input[@id='SokKungorelse']")))
    button_sok.click()
    time.sleep(5)
Let's start by finding the number of pages and the number of results on each page.
If you look at the first red circle in the picture above, you can see the number of the last page, which means we have 18 pages in total.

# find number of pages and extract the string after "av"
# The pager text reads "Sida X av N" — take the part after "av".
number_of_pages = bot.find_element_by_xpath('//div[@class="gotopagediv"]/em[@class="gotopagebuttons"]').text.split("av", 1)[1]
# str methods return new strings; the cleaned value must be reassigned
# (the original discarded this result, leaving surrounding spaces in place).
number_of_pages = number_of_pages.strip().replace(" ", "")
# All result rows (one link each) on the current page.
number_of_results = bot.find_elements_by_xpath('//table/tbody/tr')
wb = Workbook()
for page in range(int(number_of_pages)):
    # Create a new sheet for each page.
    sheet = wb.add_sheet('Sheet ' + str(page))
    style = xlwt.easyxf('font: bold 1')
    sheet.write(0, 0, 'Post Address', style)
    sheet.write(0, 1, 'Bildat', style)
    sheet.write(0, 2, 'Företagsnamn', style)
    sheet.write(0, 3, 'Email', style)
    # Click each link in the results. range(len(...)) keeps the index in
    # bounds — the original range(1, len + 1) overran the list on the last
    # row and skipped the first one. The rows are re-queried every pass
    # because navigating reloads the DOM.
    for i in range(len(number_of_results)):
        result = bot.find_elements_by_xpath('//table/tbody/tr')[i]
        link = result.find_element_by_tag_name('a')
        bot.execute_script("arguments[0].click();", link)
        time.sleep(2)
wb = Workbook()
for page in range(int(number_of_pages)):
    # cell_overwrite_ok lets the header cells be rewritten without error.
    sheet = wb.add_sheet('Sheet ' + str(page), cell_overwrite_ok=True)
    style = xlwt.easyxf('font: bold 1')
    sheet.write(0, 0, 'Post Address', style)
    sheet.write(0, 1, 'Bildat', style)
    sheet.write(0, 2, 'Företagsnamn', style)
    sheet.write(0, 3, 'Email', style)
    for i in range(len(number_of_results)):
        # Re-locate the rows each pass: the old elements go stale after bot.back().
        result = bot.find_elements_by_xpath('//table/tbody/tr')[i]
        link = result.find_element_by_tag_name('a')
        bot.execute_script("arguments[0].click();", link)
        time.sleep(2)
        information = bot.find_element_by_class_name('kungtext').text

        def extract(pattern):
            # Return the captured group, or 'null' when the field is absent.
            match = re.search(pattern, information)
            return str(match.group(1)) if match else 'null'

        # Extract each field independently: in the original, the first
        # missing field raised AttributeError and skipped every remaining
        # column of the row, not just the missing one.
        postaddress = extract(r'Postadress:(.*),')
        bildat = extract('Bildat:(.*)\n')
        foretagsnamn = extract('Företagsnamn:(.*)\n')
        email = extract('E-post:(.*)\n')
        sheet.write(i + 1, 0, postaddress)
        sheet.write(i + 1, 1, bildat)
        sheet.write(i + 1, 2, foretagsnamn)
        sheet.write(i + 1, 3, email)
        print(postaddress, bildat, foretagsnamn, email)
        bot.back()
        time.sleep(5)
    # Save after every page so a mid-run failure keeps the pages done so far.
    wb.save('emails.xls')
    print('Going to next page')
    button_next = wait.until(EC.element_to_be_clickable((By.XPATH, "//input[@id='movenextTop']")))
    button_next.click()
    time.sleep(5)
The regex extracts the value between the field name and the "\n" newline in the paragraph. In some results the email field is missing, so I added a try/except block to detect that and automatically set the field to "null".
i + 1 prevents overwriting the column names in the Excel cells. I highly recommend checking my YouTube channel for a more detailed explanation.
When the data has been successfully crawled for a single page, the program saves the data into the sheet and moves on to the next page.

import time
import datetime
import re
import xlwt
from xlwt import Workbook
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
class Bolagsverket:
    """Scrape newly registered companies ("Nyregistreringar") from
    Bolagsverket's public announcement search into emails.xls."""

    def __init__(self):
        # Adjust this path to the local geckodriver binary.
        self.bot = webdriver.Firefox(executable_path='/home/coderasha/Desktop/geckodriver')

    def _extract_field(self, pattern, text):
        """Return the first capture group of `pattern` in `text`, or the
        string 'null' when the field is absent from the announcement."""
        match = re.search(pattern, text)
        return str(match.group(1)) if match else 'null'

    def navigate_and_crawl(self):
        """Run the search for the configured date interval and write one
        spreadsheet sheet per result page into emails.xls."""
        bot = self.bot
        # An explicit scheme is required: webdriver.get() cannot load a
        # protocol-relative ('//...') URL.
        bot.get('https://poit.bolagsverket.se/poit/PublikPoitIn.do')
        time.sleep(5)
        # "Sök kungörelse" entry in the navigation menu.
        bot.find_element_by_id('nav1-2').click()
        time.sleep(5)
        bot.find_element_by_tag_name('form').find_element_by_tag_name('a').click()
        time.sleep(5)

        # Tidsperiod > "Annan period", then fill in the date interval.
        search_form = bot.find_element_by_tag_name('form')
        search_form.find_element_by_xpath("//select[@id='tidsperiod']/option[text()='Annan period']").click()
        wait = WebDriverWait(bot, 10)
        input_from = wait.until(EC.element_to_be_clickable((By.XPATH, "//input[@id='from']")))
        input_from.send_keys('2019-09-23')
        # input_from.send_keys(str(datetime.date.today() - datetime.timedelta(1)))
        input_to = wait.until(EC.element_to_be_clickable((By.XPATH, "//input[@id='tom']")))
        input_to.send_keys('2019-09-24')
        # input_to.send_keys(str(datetime.date.today()))
        time.sleep(5)

        # Drill down: subject area > announcement heading > subheading.
        amnesomrade = wait.until(EC.element_to_be_clickable((By.XPATH, "//select[@id='amnesomrade']")))
        amnesomrade.find_element_by_xpath("//select[@id='amnesomrade']/option[text()='Bolagsverkets registreringar']").click()
        time.sleep(5)
        kungorelserubrik = wait.until(EC.element_to_be_clickable((By.XPATH, "//select[@id='kungorelserubrik']")))
        kungorelserubrik.find_element_by_xpath("//select[@id='kungorelserubrik']/option[text()='Aktiebolagsregistret']").click()
        time.sleep(5)
        underrubrik = wait.until(EC.element_to_be_clickable((By.XPATH, "//select[@id='underrubrik']")))
        underrubrik.find_element_by_xpath("//select[@id='underrubrik']/option[text()='Nyregistreringar']").click()
        # Search Button
        button_sok = wait.until(EC.element_to_be_clickable((By.XPATH, "//input[@id='SokKungorelse']")))
        button_sok.click()
        time.sleep(5)

        # The pager text reads "Sida X av N" — take the part after "av".
        number_of_pages = bot.find_element_by_xpath("//div[@class='gotopagediv']/em[@class='gotopagebuttons']").text.split("av", 1)[1]
        # str methods return new strings; the cleaned value must be
        # reassigned (the original discarded this result, so int() below saw
        # the raw, space-padded text).
        number_of_pages = number_of_pages.strip().replace(" ", "")
        number_of_results = bot.find_elements_by_xpath('//table/tbody/tr')

        wb = Workbook()
        for page in range(int(number_of_pages)):
            sheet = wb.add_sheet('Sheet' + str(page))
            style = xlwt.easyxf('font: bold 1')
            sheet.write(0, 0, 'Post Address', style)
            sheet.write(0, 1, 'Bildat', style)
            sheet.write(0, 2, 'Foretagsnamn', style)
            sheet.write(0, 3, 'Email', style)
            for i in range(len(number_of_results)):
                # Re-locate the rows each pass: elements go stale after bot.back().
                result = bot.find_elements_by_xpath("//table/tbody/tr")[i]
                link = result.find_element_by_tag_name('a')
                bot.execute_script('arguments[0].click();', link)
                time.sleep(5)
                information = bot.find_element_by_class_name('kungtext').text
                # Extract each field independently: in the original, the
                # first missing field raised AttributeError and skipped every
                # remaining column of the row, not just the missing one.
                postaddress = self._extract_field(r'Postadress:(.*),', information)
                bildat = self._extract_field('Bildat:(.*)\n', information)
                foretagsnamn = self._extract_field('Företagsnamn:(.*)\n', information)
                email = self._extract_field('E-post:(.*)\n', information)
                sheet.write(i + 1, 0, postaddress)
                sheet.write(i + 1, 1, bildat)
                sheet.write(i + 1, 2, foretagsnamn)
                sheet.write(i + 1, 3, email)
                print(postaddress, bildat, foretagsnamn, email)
                bot.back()
                time.sleep(5)
            # Save after every page so a mid-run failure keeps earlier pages.
            wb.save('emails.xls')
            print('Going to next page ...')
            # The original "//input/[@id='movenextTop']" was invalid XPath
            # syntax (a '/' directly before the predicate) and raised an
            # InvalidSelectorException, so paging never worked.
            button_next = wait.until(EC.element_to_be_clickable((By.XPATH, "//input[@id='movenextTop']")))
            button_next.click()
            time.sleep(5)
if __name__ == '__main__':
    # Run the scraper only when executed as a script, not on import.
    bot = Bolagsverket()
    bot.navigate_and_crawl()
You can watch the video tutorial of this project on my YouTube channel.
I hope you enjoyed and learned something from this post. The job is still open, so you can send a proposal to the client on Upwork. Please check back for more cool content like this.
Stay Connected!