Оцените, пожалуйста, качество кода и порекомендуйте, что можно улучшить и как. Заранее спасибо! :)
Код берёт данные с сайта и записывает их в файл pickle. P.S. Пересоздаю объект класса из-за того, что Selenium требует перезапуска.
import pickle
import logging
import time
import datetime
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.remote.remote_connection import LOGGER
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
LOGGER.setLevel(logging.WARNING)
class ParserHoyo():
def __init__(self):
self.url = 'https://genshin.hoyoverse.com'
chrome_options = Options()
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--window-size=1420,1080')
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_experimental_option("excludeSwitches", ["enable-logging"])
self.driver = webdriver.Chrome(ChromeDriverManager().install(), options = chrome_options)
def load(self):
try:
with open('data/hoyoversNewsActually.pickle', 'rb') as f:
return pickle.load(f)['all-news']
except FileNotFoundError:
with open('data/hoyoversNewsActually.pickle', 'wb') as f:
pickle.dump({'all-news': {}}, f)
return {}
def dump(self, data):
old_data = self.load()
for d in data:
if d in old_data: continue
old_data[d] = data[d]
with open('data/hoyoversNewsActually.pickle', 'wb') as f:
pickle.dump({'all-news': old_data}, f)
def get_full_desc(self, object):
html = self.get_html(object['href'])
data = {}
soup = BeautifulSoup(html, 'html.parser')
soup = soup.find('div', class_='article cate')
data['title'] = soup.find('div', class_='article__title').text.replace('\n', '').replace('\xa0', ' ')
data['desc'] = ['\n' if sp.text.replace('\xa0', ' ') == ' ' else sp.text.replace('\xa0', ' ') if len(sp.find_all('a')) == 0 else f"[{sp.text}]({sp.find('a').get('href')})" for sp in soup.find_all('p') if (len(sp.find_all('img')) == 0)]
data['imgs'] = [sp.get('src') for sp in soup.find_all('img')[:2]]
return data
def get_desc(self, html):
soup = BeautifulSoup(html, 'html.parser')
soup = soup.find("body").find("ul", class_='news')
data = {}
for sp in soup.find_all('li')[:-1]:
data_news = {}
page = sp.find('a', class_ = 'news__title news__content ellipsis')
text = page.find('div', class_ = 'news__info')
data_news['title'] = text.find('h3').text
data_news['desc'] = text.find('p', class_ = 'news__summary').text
data_news['href'] = self.url + page.get('href')
data_news['category'] = sp.find('a', class_ = 'news__category').text
data_news['img'] = self.url + page.find('img').get('src')
data_news['date'] = datetime.datetime.strptime(sp.find('div', class_ = 'news__date').text.replace(',', ''), '%b %d %Y')
data_news['full_desc'] = self.get_full_desc(data_news)
data[int(page.get('href').split('/')[-1])] = data_news
return data
def get_html(self, url):
self.driver.get(url)
WebDriverWait(self.driver, 300).until(lambda driver: BeautifulSoup(driver.page_source, 'lxml').find_all('div', class_='header__navwrap')) != 0
return self.driver.page_source
def parse(self):
html = self.get_html(self.url+'/en/news')
data = self.get_desc(html)
self.driver.close()
self.dump(data)
if __name__ == '__main__':
while 1:
try:
client = ParserHoyo()
client.parse()
print(f'Последнее сохранение - {datetime.datetime.now()}')
time.sleep(60*30)
except Exception as err:
print(err)