Парсинг динамического сайта Python

Question

Мне нужно спарсить все скидки со страницы https://5ka.ru/special_offers, однако часть товаров подгружается динамически — только после нажатия на кнопку «Загрузить еще». Можно ли получить и эти товары?

import fake_useragent
import requests
from bs4 import BeautifulSoup


def get_discounts():
    """Fetch the 5ka.ru special-offers page and print every discount found.

    Only the product cards present in the HTML returned by the initial
    request are printed; cards that the page adds later via the
    "Загрузить еще" button are not part of that response.

    Raises:
        requests.HTTPError: if the server responds with an error status.
    """
    user = fake_useragent.UserAgent().random
    headers = {
        "User-Agent": user,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    }

    url = "https://5ka.ru/special_offers/"

    # Without a timeout requests may block indefinitely on a stalled server.
    req = requests.get(url=url, headers=headers, timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    req.raise_for_status()
    soup = BeautifulSoup(req.text, 'lxml')

    articles = soup.find("div", class_="items-list").find_all("div", class_="product-card item")

    for article in articles:
        # Both prices live inside the same container, so look it up once.
        price_block = article.find("div", class_="price-discount")
        article_price_with_discount = price_block.find("span").text
        article_price_with_out_discount = price_block.find("span", class_="price-regular").text
        article_name = article.find("div", class_="item-name").text
        article_date = article.find("div", class_="item-date").text

        print(article_name.strip())
        print(article_price_with_discount)
        print(article_price_with_out_discount.strip())
        print(article_date.strip())


def main():
    """Script entry point: print the currently available discounts."""
    get_discounts()


# Run the scraper only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()

Answer 1

Для решения этой задачи вам понадобится Selenium: после сбора данных с первой страницы нужно нажимать на элемент пагинации (кнопку «Загрузить еще»), чтобы подгрузить остальные товары.

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import (
    ElementNotInteractableException,
    NoSuchElementException,
)
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options

def get_discounts():
    """Load the 5ka.ru special-offers page in headless Firefox and print all discounts.

    The "load more" control is clicked repeatedly so that dynamically added
    product cards end up in the DOM; the final page source is then parsed
    with BeautifulSoup and every product card is printed.
    """
    options = Options()
    options.add_argument('--headless')

    driver = webdriver.Firefox(options=options)
    # Give dynamically inserted elements up to 10 seconds to appear.
    driver.implicitly_wait(10)
    try:
        driver.get("https://5ka.ru/special_offers/")

        # Keep clicking the pagination element until it can no longer be used.
        # NOTE(review): assumes the button is removed or becomes
        # non-interactable once every item is loaded — confirm on the live page.
        while True:
            try:
                a_next_page = driver.find_element(By.CLASS_NAME, 'more-btn-cont')
                a_next_page.click()
            except (NoSuchElementException, ElementNotInteractableException):
                break

        soup = BeautifulSoup(driver.page_source, 'lxml')
    finally:
        # Always shut the browser down, otherwise the Firefox process leaks.
        driver.quit()

    articles = soup.find("div", class_="items-list").find_all("div", class_="product-card item")

    for article in articles:
        # Both prices live inside the same container, so look it up once.
        price_block = article.find("div", class_="price-discount")
        article_price_with_discount = price_block.find("span").text
        article_price_with_out_discount = price_block.find("span", class_="price-regular").text
        article_name = article.find("div", class_="item-name").text
        article_date = article.find("div", class_="item-date").text

        print(article_name.strip())
        print(article_price_with_discount)
        print(article_price_with_out_discount.strip())
        print(article_date.strip())

Если вы хотите увидеть, что происходит в браузере, просто уберите параметр `--headless`.

Также я бы немного подкорректировал вывод данных (print) в вашем коде, примерно вот так:

for article in articles:
    # Both prices live inside the same container, so look it up once.
    price_block = article.find("div", class_="price-discount")
    article_price_with_discount = price_block.find("span").text
    article_price_with_out_discount = price_block.find("span", class_="price-regular").text
    article_name = article.find("div", class_="item-name").text.strip()
    article_date = article.find("div", class_="item-date").text.strip()

    # Parentheses are required for a tuple literal that spans several lines.
    row = (
        article_price_with_discount,
        article_price_with_out_discount,
        article_name,
        article_date,
    )

    print(row)

БЛОГ НА HUSL

Парсинг динамического сайта Python

Ответы (1 шт):