import requests
import csv
from bs4 import BeautifulSoup
import fake_useragent
import logging
# Configure root logging at INFO so page-by-page parse progress and errors
# are visible when the script runs; module-level logger per stdlib convention.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def hh_parse(base_url, headers):
    """
    Parse job listings from hh.ru for the search query 'python'.

    :param base_url: URL of the first results page.
    :type base_url: str
    :param headers: HTTP headers for the request (not mutated; a copy is used).
    :type headers: dict
    :return: List of job listings, each a dict with 'title' and 'href' keys.
    :rtype: list
    """
    jobs = []
    urls = [base_url]
    session = requests.Session()

    # Work on a copy: the original wrote the random User-Agent into the
    # caller's dict, a surprising side effect for a pure "parse" function.
    headers = dict(headers)
    headers['User-Agent'] = fake_useragent.UserAgent().random

    # Initial request to discover how many result pages exist.
    response = session.get(base_url, headers=headers)
    if response.status_code != 200:
        logger.error("Error sending request to hh.ru.")
        return jobs

    soup = BeautifulSoup(response.content, 'lxml')
    try:
        pagination = soup.find_all('a', attrs={'data-qa': 'pager-page'})
        # Last pager link's label is the total page count; page params
        # are 0-based, hence range(count_pages).
        count_pages = int(pagination[-1].text)
        for i in range(count_pages):
            url = f'https://hh.ru/search/vacancy?area=1&search_period=3&text=python&page={i}'
            if url not in urls:
                urls.append(url)
    except (IndexError, ValueError):
        # No pagination links found, or the label was not a number.
        logger.error("Error getting number of pages.")
        return jobs

    # Bug fix: iterate over every collected page. The original looped over
    # urls[:-1], silently dropping the last results page.
    for url in urls:
        logger.info("Parsing page: %s", url)
        response = session.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'lxml')
        divs = soup.find_all('div', attrs={'class': 'vacancy-serp-item-body'})
        for div in divs:
            try:
                # NOTE(review): title and href use different data-qa anchors;
                # presumably both exist in each listing — verify against the
                # current hh.ru markup.
                title = div.find('a', attrs={'data-qa': 'serp-item__title'}).text
                href = div.find('a', attrs={'data-qa': 'vacancy-serp__vacancy-title'})['href']
                jobs.append({
                    'title': title,
                    'href': href,
                })
            except (AttributeError, TypeError, KeyError):
                # Anchor missing or markup changed; skip this listing only.
                logger.error("Error parsing job listing.")
    logger.info("Job listings parsed: %s", len(jobs))
    return jobs
def write_to_csv(jobs):
    """
    Write job listings to 'parsed_jobs.csv' in the current directory.

    :param jobs: List of job listings; each item needs 'title' and 'href' keys.
    :type jobs: list
    """
    # Explicit utf-8 so non-ASCII job titles round-trip regardless of the
    # platform's default encoding; newline='' is required by the csv module.
    with open('parsed_jobs.csv', 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file, delimiter=',')
        for job in jobs:
            try:
                writer.writerow((job['title'], job['href']))
            except KeyError:
                # Malformed listing dict; log it and keep writing the rest.
                logger.error("Error writing job listing to CSV.")
    logger.info("Job listings written to CSV file.")
# Defaults used when the module is run as a script. Kept at module level so
# existing importers of these names keep working.
base_url = 'https://hh.ru/search/vacancy?area=1&search_period=3&text=python&page=0'
headers = {'User-Agent': ''}  # User-Agent is filled in by hh_parse

if __name__ == '__main__':
    # Guard so importing this module does not trigger network I/O.
    jobs = hh_parse(base_url, headers)
    write_to_csv(jobs)