import requests
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.service import Service
import random
from selenium import webdriver
import time
def get_source(
    url,
    driver_path=r"C:\Users\rakal\PycharmProjects\skraping_bot_for_momy\chromedriver\chromedriver.exe",
    output_path="source_page.html",
):
    """Open *url* in Chrome and save the page source to *output_path*.

    Args:
        url: Address of the page to load.
        driver_path: Path to the chromedriver executable used to start Chrome.
        output_path: File the page source is written to, encoded as UTF-8.

    Errors raised while driving the browser or writing the file are printed
    and not re-raised; the browser session is shut down in every case.
    """
    service = Service(driver_path)
    driver = webdriver.Chrome(service=service)
    try:
        # Kept inside the try so the driver is still shut down if this fails.
        driver.maximize_window()
        driver.get(url=url)
        # Fixed 3-second pause before reading page_source -- presumably to
        # let dynamically loaded content finish rendering.
        time.sleep(3)
        with open(output_path, "w", encoding="utf-8") as file:
            file.write(driver.page_source)
    except Exception as _ex:
        # Best-effort: report the problem and fall through to cleanup.
        print(_ex)
    finally:
        # quit() alone closes every window and ends the driver session; a
        # preceding close() is redundant and, if it raised, would skip quit().
        driver.quit()
def get_items_urls(file_path):
    """Extract item links from a saved HTML page into ``items_urls.txt``.

    Every ``<div class="aga2">`` in the page is searched for its first
    ``<a>`` element; the ``href`` of each such link is written to
    ``items_urls.txt`` in the current working directory, one per line.
    Divs without a link, or links without an ``href``, are skipped.

    Args:
        file_path: Path to the UTF-8 encoded HTML file to parse.

    Returns:
        The status string ``'[INFO] Urls collected!'``.
    """
    # The page is written as UTF-8 (see get_source), so it must be read as
    # UTF-8 explicitly; the platform default (e.g. cp1251 on Windows) fails
    # with UnicodeDecodeError on bytes it cannot map.
    with open(file_path, encoding="utf-8") as file:
        src = file.read()

    # src is already decoded text, so no encoding argument is passed on.
    soup = BeautifulSoup(src, "lxml")
    items_divs = soup.find_all("div", class_="aga2")

    urls = []
    for item in items_divs:
        link = item.find("a")
        if link is None:
            continue
        item_url = link.get("href")
        if item_url:
            urls.append(item_url)

    with open("items_urls.txt", "w", encoding="utf-8") as file:
        file.writelines(f"{url}\n" for url in urls)

    return '[INFO] Urls collected!'
def main():
    """Run the URL extraction step against the saved page at a fixed path."""
    # Download step, currently disabled; the copy already on disk is parsed.
    # get_source(url="https://www.ozon.ru/category/podguzniki-i-trusiki-30749/?deny_category_prediction=true&from_global=true&isdiscount=t&text=подгузники")
    saved_page = r"C:\Users\rakal\PycharmProjects\skraping_bot_for_momy\source_page.html"
    get_items_urls(file_path=saved_page)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Traceback (most recent call last):
File "C:\Users\rakal\PycharmProjects\skraping_bot_for_momy\main.py", line 49, in <module>
main()
File "C:\Users\rakal\PycharmProjects\skraping_bot_for_momy\main.py", line 45, in main
get_items_urls(file_path=r"C:\Users\rakal\PycharmProjects\skraping_bot_for_momy\source_page.html")
File "C:\Users\rakal\PycharmProjects\skraping_bot_for_momy\main.py", line 29, in get_items_urls
src = file.read()
File "C:\Users\rakal\AppData\Local\Programs\Python\Python310\lib\encodings\cp1251.py", line 23, in decode
return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x98 in position 276732: character maps to <undefined>