Ошибка парсинге сайта
Короче есть код для парсинга сайта, первый раз он почти спарсил всё что нужно, но потом при повторных перезапусках в строке 42 он может выдавать ошибку "occupations" или в 74 "workers", а может выдать одну из них отпарсив уже несколько страниц, хотя первые разы такого не было, в чем проблема? Строки с ошибками пометил комментариями
import json
import time
import requests
from bs4 import BeautifulSoup
import os
from colorama import init, Fore, Back, Style
def fetch(url,params):
headers = params['headers']
body = params['body']
if params['method']=='GET':
return requests.get(url,headers=headers)
if params['method']=='POST':
return requests.post(url,headers=headers,data=body)
return requests
main_page = fetch("https://uslugi.yandex.ru/api/213-moscow/get_home_rubrics?lr=213&workersCount=true", {
"headers": {
"accept": "application/json, text/plain, */*",
"accept-language": "ru-RU,ru;q=0.9",
"content-type": "application/json;charset=UTF-8",
"sec-ch-ua": "\"Chromium\";v=\"102\", \"Opera GX\";v=\"88\", \";Not A Brand\";v=\"99\"",
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": "\"Windows\"",
"sec-fetch-dest": "empty",
"sec-fetch-mode": "cors",
"sec-fetch-site": "same-origin",
"x-app-version": "2.2447.0-15f89d2af056750a15086038994cba0e5bfeea51.0",
"x-csrf-token": "da2f836cc33b86d110f223c0f134d10fd47f75e1:1656503032",
"x-expected-puid": "",
"x-requested-with": "XMLHttpRequest",
"x-retpath-y": "https://uslugi.yandex.ru/213-moscow/catalog?rubric=%2Fkrasota",
"cookie": "yandexuid=2585937781656503024; i=gWrntQ7lN3x3Uw5qP0Qz15+IdJQ5aqQRg67OJMWpzHPOsjaH3cLe4bx1+Vp/twBpHbZv9BdJW9noXS+kKVTl8HPt7l0=; _yasc=2w/cmb9EmlLja6ze+YvsuHHaSC1fsXOTMKq4gKynm/cOzsMs; yuidss=2585937781656503024; ymex=1971863025.yrts.1656503025; gdpr=0; _ym_uid=1656503024996753070; _ym_d=1656503025; _ym_visorc=b; spravka=dD0xNjU2NTAzMDMyO2k9OTUuMzEuMTY0LjE0NztEPTkzRTg4RUU2M0VBMTg2NjNFNTQzNTNCREIyNEUzMTVFNjU1QTVGMjI1NzU2NDNGRjMxRTk3NTE2NEYyN0Y4Q0VBNzBCQ0M2NDt1PTE2NTY1MDMwMzI4NjY4NzEzNjk7aD0zYThmYTliZWYzNTU1NDMzZjk0MmM0NDMyNjAwYTk5Mg==; _ym_isad=2",
"Referer": "https://uslugi.yandex.ru/213-moscow/catalog?rubric=%2Fkrasota",
"Referrer-Policy": "strict-origin-when-cross-origin",
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.115 Safari/537.36 OPR/88.0.4412.65'
},
"body": "{\"data\":{\"params\":{}}}",
"method": "POST"
})
for i in range(58):
name_for_dir = main_page.json()['occupations'][8]['specializations'][i]['name'] # Ошибка occupations
os.mkdir(f'{name_for_dir}')
print('\t'+Fore.RED + main_page.json()['occupations'][8]['specializations'][i]['name']+Fore.RESET)
krasota_specializations_id = main_page.json()['occupations'][8]['specializations'][i]['numberId']
krasota_specializations_seoID = 'https://uslugi.yandex.ru/api/213-moscow/category' + main_page.json()['occupations'][8]['specializations'][i]['seoId']
for page in range(10):
print('\t'+Fore.RED+f'Page: {page} from 10'+Fore.RESET)
workers = fetch(f"{krasota_specializations_seoID}--{krasota_specializations_id}?p={page}", {
"headers": {
"accept": "application/json, text/plain, */*",
"accept-language": "ru-RU,ru;q=0.9",
"sec-ch-ua": "\"Chromium\";v=\"102\", \"Opera GX\";v=\"88\", \";Not A Brand\";v=\"99\"",
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": "\"Windows\"",
"sec-fetch-dest": "empty",
"sec-fetch-mode": "cors",
"sec-fetch-site": "same-origin",
"x-app-version": "2.2447.0-15f89d2af056750a15086038994cba0e5bfeea51.0",
"x-csrf-token": "5d9ba3e6858087eb66d56266930eee265a79cf3d:1656503276",
"x-expected-puid": "",
"x-requested-with": "XMLHttpRequest",
"x-retpath-y": f"{krasota_specializations_seoID}--{krasota_specializations_id}?p={page}",
"x-uslugi-apitargeturl": f"{krasota_specializations_seoID}--{krasota_specializations_id}?p={page}",
"cookie": "yandexuid=2585937781656503024; i=gWrntQ7lN3x3Uw5qP0Qz15+IdJQ5aqQRg67OJMWpzHPOsjaH3cLe4bx1+Vp/twBpHbZv9BdJW9noXS+kKVTl8HPt7l0=; yuidss=2585937781656503024; ymex=1971863025.yrts.1656503025; gdpr=0; _ym_uid=1656503024996753070; _ym_d=1656503025; _ym_visorc=b; spravka=dD0xNjU2NTAzMDMyO2k9OTUuMzEuMTY0LjE0NztEPTkzRTg4RUU2M0VBMTg2NjNFNTQzNTNCREIyNEUzMTVFNjU1QTVGMjI1NzU2NDNGRjMxRTk3NTE2NEYyN0Y4Q0VBNzBCQ0M2NDt1PTE2NTY1MDMwMzI4NjY4NzEzNjk7aD0zYThmYTliZWYzNTU1NDMzZjk0MmM0NDMyNjAwYTk5Mg==; _ym_isad=2; is_gdpr=0; is_gdpr_b=CNzDcxC+ew==; _yasc=/BObO2mO50Khb3LNMjsrTAvUhIW7wjC4qHpExN/rupJa65vp",
"Referer": f"{krasota_specializations_seoID}--{krasota_specializations_id}?p={page}",
"Referrer-Policy": "strict-origin-when-cross-origin",
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.115 Safari/537.36 OPR/88.0.4412.65'
},
"body": None,
"method": "GET"
})
print(f"{krasota_specializations_seoID}--{krasota_specializations_id}?p={page}")
persons = workers.json()['workers']['items'] # Ошибка workers
for person in persons:
person_name = persons[person]['personalInfo']['displayName']
try:
person_description = persons[person]['personalInfo']['description'].strip()
except:
person_description = ''
continue
person_socialLinks_appointments = persons[person]['personalInfo']['socialLinks']['appointments']
person_socialLinks_messengers = persons[person]['personalInfo']['socialLinks']['messengers']
person_socialLinks_other = persons[person]['personalInfo']['socialLinks']['other']
person_socialLinks_appointments_list = {}
person_socialLinks_messengers_list = {}
person_socialLinks_other_list = {}
for ln in person_socialLinks_appointments:
first = ln
second = person_socialLinks_appointments[ln]
person_socialLinks_appointments_list[first] = second
for ln in person_socialLinks_messengers:
first = ln
second = person_socialLinks_messengers[ln]
person_socialLinks_messengers_list[first]= second
for ln in person_socialLinks_other:
first = ln
second = person_socialLinks_other[ln]
person_socialLinks_messengers_list[first]= second
if person_socialLinks_appointments == {}:
person_socialLinks_appointments = ''
if person_socialLinks_messengers == {}:
person_socialLinks_messengers = ''
if person_socialLinks_other == {}:
person_socialLinks_other = ''
person_data = []
person_data.append(
{
'Name: ':person_name,
'Appointments: ':person_socialLinks_appointments_list,
'Messangers: ': person_socialLinks_messengers_list,
'Other Links: ': person_socialLinks_other_list,
'Description: ': person_description
}
)
print(Fore.YELLOW+'Name: '+Fore.RESET+f'{person_name}\n\n'+Fore.YELLOW+'Links:\n'+Fore.GREEN+'Appointments: '+Fore.RESET+f'{person_socialLinks_appointments}\n'+
Fore.GREEN+'Messengers: '+Fore.RESET+f'{person_socialLinks_messengers}\n'+Fore.GREEN+'Other: '+Fore.RESET+f'{person_socialLinks_other}\n\n'+Fore.YELLOW+'Description: '+
Fore.RESET+f'{person_description}\n')
print(Fore.RED+'----------------------'+Fore.RESET)
with open(f'{name_for_dir}/page_{page+1}.json','a',encoding='utf-8') as file:
json.dump(person_data,file,indent=4,ensure_ascii=False)