Problem with parsing a page and html5lib
The parsing code (parse_page.py):
import traceback
import logging
from collections import namedtuple
from datetime import date
import bs4
avito_url = 'https://www.avito.ru'
def parse_time(time_str):
    tl = time_str.strip().split(' ')
    if len(tl) == 2:
        return True
    elif len(tl) == 3:
        if tl[1] == 'мая':
            return True
    return False


def get_img_link(img_tag):
    if not img_tag:
        return
    assert isinstance(img_tag, bs4.element.Tag)
    if img_tag.has_attr('data-srcpath'):
        return img_tag['data-srcpath']
    elif img_tag.has_attr('src'):
        return img_tag['src']
    # fall back to the first usable link found anywhere inside the tag
    for chld in img_tag.descendants:
        if isinstance(chld, bs4.element.Tag):
            res = get_img_link(chld)
            if res is not None:
                return res


SimpleAd = namedtuple('Ad', ['id', 'photo', 'title', 'price', 'loc', 'ubahn_dist', 'time', 'link'])


class Ad(SimpleAd):
    # ads compare equal (and hash) by id only, so a set of Ad deduplicates listings
    def __eq__(self, other):
        return self.id == other.id

    def __hash__(self):
        return hash(self.id)


def get_metro_distance(ad_location):
    # the distance appears in the address as '<N> м,' or '<N> км,'; convert to metres
    elements = ad_location.split(' ')
    multiplier = 1
    for i in range(len(elements)):
        if elements[i].strip() in ['м,', 'км,']:
            if elements[i].startswith('км'):
                multiplier = 1000
            distance_str = elements[i - 1].replace(',', '.')
            return float(distance_str) * multiplier
    logging.warning("Couldn't get metro distance for %s", ad_location)
    return 0

def parse_page(html_str):
    result = set()
    soup = bs4.BeautifulSoup(html_str, "html5lib")
    cont = soup.find('div', "js-catalog_serp")
    assert isinstance(cont, bs4.element.Tag)
    for elem in cont.find_all('div', recursive=False):
        assert isinstance(elem, bs4.element.Tag)
        # skip ad banners, popups and VIP blocks
        if 'avito-ads-container' in elem['class']:
            continue
        if 'item-popup-content' in elem['class']:
            continue
        if 'serp-vips' in elem['class']:
            continue
        try:
            ad_id = elem.get('id')[1:]
            ad_photo_link = get_img_link(elem.find('div', class_='item-slider-image'))
            elem_title = elem.find('h3', 'snippet-title')
            assert isinstance(elem_title, bs4.element.Tag)
            ad_link = elem_title.find('a').get('href')
            ad_title = str(elem_title.find('a').contents[0])
            ad_price = elem.find('div', 'snippet-price-row').text.strip()
            ad_location = elem.find('div', 'address').text.strip()
            if 'м' not in ad_location and 'км' not in ad_location:
                logging.warning("Can't parse address: %s", ad_location)
                continue
            ad_metro_dist = get_metro_distance(ad_location)
            ad_time = elem.find('div', 'snippet-date-info').text.strip()
            # turn 'Сегодня'/'Вчера' into an explicit day (month is hardcoded)
            ad_time = ad_time.replace('Сегодня', str(date.today().day) + ' марта')
            ad_time = ad_time.replace('Вчера', str(date.today().day - 1) + ' марта')
            new_ad = Ad(
                int(ad_id),
                ad_photo_link,
                ad_title.strip(),
                ad_price,
                ad_location,
                ad_metro_dist,
                ad_time.strip(),
                avito_url + ad_link
            )
            result.add(new_ad)
        except Exception:
            logging.error(traceback.format_exc())
            continue
    return result


if __name__ == '__main__':
    with open('page_sample.htm', encoding='utf8') as f:
        dd = f.read()
    for element in parse_page(dd):
        logging.info(element)
The error I get:
2024-04-23 21:49:14,369 - root - ERROR - Traceback (most recent call last):
File "C:\Users\Vova\Desktop\find-a-flat-bot-master\main.py", line 18, in main
new_ads = main_pars.get_new_ads(uid, target_url)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\Vova\Desktop\find-a-flat-bot-master\main_pars.py", line 48, in get_new_ads
parse_res = get_ads(target_url)
^^^^^^^^^^^^^^^^^^^
File "C:\Users\Vova\Desktop\find-a-flat-bot-master\main_pars.py", line 26, in get_ads
parse_res = parse_page(page_content)
^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\Vova\Desktop\find-a-flat-bot-master\parse_page.py", line 62, in parse_page
soup = bs4.BeautifulSoup(html_str, "html5lib")
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\Vova\AppData\Local\Programs\Python\Python311\Lib\site-packages\bs4\__init__.py", line 250, in __init__
raise FeatureNotFound(
bs4.FeatureNotFound: Couldn't find a tree builder with the features you requested: html5lib. Do you need to install a parser library?
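As far as I understand, bs4 only registers the "html5lib" tree builder when the html5lib package itself can be imported, so the FeatureNotFound above most likely just means that package is missing from this Python 3.11 installation (pip install html5lib). A minimal sketch of guarding the parser choice, assuming that is the case (PARSER and make_soup are names I made up for the example):

import bs4

try:
    import html5lib  # noqa: F401  (only checking that the package is importable)
    PARSER = 'html5lib'
except ImportError:
    PARSER = 'html.parser'  # stdlib fallback, less forgiving of malformed markup


def make_soup(html_str):
    return bs4.BeautifulSoup(html_str, PARSER)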
When I use
soup = bs4.BeautifulSoup(html_str, "html.parser")
instead, it outputs:
DEBUG:telegram.ext.updater:Start network loop retry getting Updates
DEBUG:root:GET https://www.avito.ru/chelyabinsk/kvartiry/prodam-ASgBAgICAUSSA8YQ?cd=1&context=H4sIAAAAAAAA_0q0MrSqLraysFJKK8rPDUhMT1WyLrYysVLKTczMU7KuBQQAAP__w5qblCAAAAA
DEBUG:telegram.bot:Entering: get_updates
DEBUG:root:403
ERROR:root:Error: Expecting value: line 1 column 1 (char 0)
2024-04-23 21:46:21,422 - root - ERROR - Error: Expecting value: line 1 column 1 (char 0)
ERROR:root:Traceback (most recent call last):
File "C:\Users\Vova\Desktop\find-a-flat-bot-master\main.py", line 18, in main
new_ads = main_pars.get_new_ads(uid, target_url)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\Vova\Desktop\find-a-flat-bot-master\main_pars.py", line 48, in get_new_ads
parse_res = get_ads(target_url)
^^^^^^^^^^^^^^^^^^^
File "C:\Users\Vova\Desktop\find-a-flat-bot-master\main_pars.py", line 28, in get_ads
paste_url = paste(page_content)
^^^^^^^^^^^^^^^^^^^
File "C:\Users\Vova\Desktop\find-a-flat-bot-master\main_pars.py", line 13, in paste
return 'https://hastebin.com/' + oo.json()['key']
^^^^^^^^^
File "C:\Users\Vova\AppData\Local\Programs\Python\Python311\Lib\site-packages\requests\models.py", line 897, in json
return complexjson.loads(self.text, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\Vova\AppData\Local\Programs\Python\Python311\Lib\json\__init__.py", line 346, in loads
return _default_decoder.decode(s)
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\Vova\AppData\Local\Programs\Python\Python311\Lib\json\decoder.py", line 337, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\Vova\AppData\Local\Programs\Python\Python311\Lib\json\decoder.py", line 355, in raw_decode
raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
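The JSONDecodeError itself is secondary: parse_page raised an AssertionError (the 403 page has no js-catalog_serp container), get_ads then tried to paste the page to hastebin for debugging, and oo.json() in paste() (main_pars.py, shown below) blew up because hastebin did not answer with JSON. A more defensive paste, just as a sketch (the None return value and the logged fields are my own choice here):

import logging
import requests


def paste(data):
    resp = requests.post('https://hastebin.com/documents', data=data.encode('utf-8'))
    # only trust the body as JSON if the service says it is JSON
    if not resp.ok or 'application/json' not in resp.headers.get('Content-Type', ''):
        logging.error('paste failed: HTTP %s, body starts with %r', resp.status_code, resp.text[:100])
        return None
    return 'https://hastebin.com/' + resp.json()['key']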
main_pars.py
import os
import pickle
import logging
import requests
from parse_page import parse_page
from os import path
from collections import defaultdict

def paste(data):
    # upload the raw page to hastebin for debugging and return a link to it
    oo = requests.post('https://hastebin.com/documents', data=data.encode('utf-8'))
    return 'https://hastebin.com/' + oo.json()['key']


def get_page(target_url):
    logging.debug("GET %s", target_url)
    oo = requests.get(target_url)
    logging.debug(oo.status_code)
    return oo.text


def get_ads(target_url):
    page_content = get_page(target_url)
    try:
        parse_res = parse_page(page_content)
    except AssertionError:
        # parsing failed: publish the offending page and re-raise
        paste_url = paste(page_content)
        logging.error('Failed to parse page %s', paste_url)
        raise
    return parse_res


def get_new_ads(uid, target_url):
    # previously seen ads are cached per user id in a pickled dict of sets
    ads = defaultdict(set)
    target_dir = 'scan_results'
    if not path.exists(target_dir):
        os.makedirs(target_dir)
    dump_file_name = path.join(target_dir, 'scan.dump')
    try:
        with open(dump_file_name, 'rb') as f:
            ads = pickle.load(f)
    except (IOError, EOFError):
        pass
    parse_res = get_ads(target_url)
    logging.debug("Total ads: %i", len(parse_res))
    diff = parse_res.difference(ads[uid])
    logging.debug("New ads: %i", len(diff))
    if diff:
        ads[uid].update(parse_res)
        with open(dump_file_name, 'wb') as f:
            pickle.dump(ads, f)
    return diff


if __name__ == '__main__':
    _ = paste('1231231231254')
    get_new_ads(1, 'https://www.avito.ru/sankt-peterburg/kvartiry/sdam/na_dlitelnyy_srok-ASgBAgICAkSSA8gQ8AeQUg?cd=1&metro=194&f=ASgBAQICAkSSA8gQ8AeQUgFAzAgkkFmOWQ')
    get_new_ads(9, 'https://www.avito.ru/sankt-peterburg/kvartiry/sdam/na_dlitelnyy_srok/1-komnatnye-ASgBAQICAkSSA8gQ8AeQUgFAzAgUjlk?cd=1&map=e30%3D&user=1&metro=194&f=ASgBAQICAkSSA8gQ8AeQUgJA6BYU6PwBzAgUjlk')
    new_ads = get_new_ads(0, 'https://www.avito.ru/sankt-peterburg/kvartiry/sdam?cd=1&pmax=43000&pmin=0&metro=157-160-164-165-173-176-180-189-191-199-205-209-210-211-1016&f=568_14011b0.550_5702-5703-5704-5705-5706')
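And for completeness, the DEBUG:root:403 line above means Avito rejected the request before any parsing happened, so whatever reaches parse_page is an error page rather than the listing. A sketch of get_page that at least makes that failure explicit (the header values are only an example; Avito may well keep blocking plain HTTP clients anyway):

import logging
import requests

HEADERS = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36'),
    'Accept-Language': 'ru-RU,ru;q=0.9',
}


def get_page(target_url):
    logging.debug("GET %s", target_url)
    resp = requests.get(target_url, headers=HEADERS, timeout=30)
    logging.debug(resp.status_code)
    resp.raise_for_status()  # fail fast on 403/429 instead of parsing an error page
    return resp.text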