import requests
from bs4 import BeautifulSoup
import json
import csv
# Request headers: a desktop-Chrome User-Agent string so the target site
# serves its normal markup rather than rejecting the default client.
headers = {
    'Accept': '*/*',
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/108.0.0.0 Safari/537.36'
    ),
}
# Load the category-name -> URL mapping produced by a previous crawl step.
with open("all_categories_dict.json") as file:
    all_categories = json.load(file)

count = 0
for category_name, category_href in all_categories.items():
    # NOTE(review): only the first category is processed — this looks like a
    # debug limit left in place; drop the guard to crawl every category.
    if count == 0:
        # Sanitize the name for use in file paths: commas, spaces and
        # hyphens all become underscores in a single translate() pass.
        category_name = category_name.translate(
            str.maketrans({',': '_', ' ': '_', '-': '_'})
        )

        # timeout guards against a stalled server hanging the script forever.
        req = requests.get(url=category_href, headers=headers, timeout=30)
        src = req.text

        # Cache the raw HTML to disk so the page can be re-parsed later
        # without another download.
        with open(f'data/{count}_{category_name}.html', 'w', encoding='utf8') as file:
            file.write(src)

        with open(f'data/{count}_{category_name}.html', encoding='utf8') as file:
            src = file.read()

        soup = BeautifulSoup(src, 'lxml')

        # First table row holds the column headers:
        # product / calories / proteins / fats / carbohydrates.
        table_head = soup.find(class_='mzr-tc-group-table').find('tr').find_all('th')
        product = table_head[0].text
        calories = table_head[1].text
        proteins = table_head[2].text
        fats = table_head[3].text
        carbohydrates = table_head[4].text

        # newline='' is required by the csv module; without it Windows
        # inserts a blank line after every written row.
        with open(f'data/{count}_{category_name}.csv', 'w', encoding='utf8', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(
                (
                    product,
                    calories,
                    proteins,
                    fats,
                    carbohydrates,
                )
            )

        # Each <tbody> row is one product; the first cell's link text is
        # the product title. (Fixed 'prooduct' typos in local names.)
        product_rows = soup.find(class_='mzr-tc-group-table').find("tbody").find_all("tr")
        for row in product_rows:
            cells = row.find_all("td")
            title = cells[0].find("a").text
            print(title)
    count += 1