есть скрипт который отправляет запрос на сайт, получает ответ, если добавились новые элементы на сайте, то начинает их искать перебором в цикле
Смотрите, сайт обновляется в рандомное время. Скрипт постоянно ищет новые элементы и выводит их в консоль, но при по индексном сравнении что-то сбивается и выводит уже старые элементы. Сверять именно так нужно для производительности. https://www.nl.go.kr/seoji/
-*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import StaleElementReferenceException
from googletrans import Translator
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from seleniumrequests import Firefox
from seleniumrequests.request import RequestMixin
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.ui import Select
import sys
from datetime import datetime
import gc
#import datetime
import time
import numpy
import pytz
from pytz import all_timezones
from pytz import country_timezones
import urllib.request
import os
import numpy as np
import pickle
import traceback
import urllib.request
import urllib
import re
import requests
import lxml
import cchardet
import requests
import aiohttp
import asyncio
titles = []
titles2 = []
cookies = {
'PCID': '77a75160-93ed-e108-b921-7852f64f2511-1634605438412',
'_ga': 'GA1.3.1244550291.1634605439',
'_INSIGHT_CK_1101': 'e0e2860fab3ab16ca978df95f6a3dc4c_14698|a8749426e3d8f0df71f0df95f6a3dc4c_14698:1644217287000',
'WMONID': 'NJ1rbZbMcQG',
'JSESSIONID': '"ZqCHwLVq080CsBs7lW8kMFTNnlpwLeAZyon8yo3E.NLSEOJIWAS1:nl_seoji_1"',
'JSESSIONID_NL_USER': '"EK6AyZukS0G1MuJ4T2T3W9PUOCI_X8OLrFRjub2r.NLUWAS1:nl_main_1"',
}
headers = {
'Connection': 'keep-alive',
'sec-ch-ua': '";Not A Brand";v="99", "Chromium";v="94"',
'Accept': '*/*',
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'X-Requested-With': 'XMLHttpRequest',
'sec-ch-ua-mobile': '?0',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36',
'sec-ch-ua-platform': '"Linux"',
'Origin': 'https://nl.go.kr',
'Sec-Fetch-Site': 'same-origin',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Dest': 'empty',
'Referer': 'https://nl.go.kr/seoji/contents/S80100000000.do',
'Accept-Language': 'en-US,en;q=0.9,ru;q=0.8',
# Requests sorts cookies= alphabetically
# 'Cookie': 'PCID=77a75160-93ed-e108-b921-7852f64f2511-1634605438412; _ga=GA1.3.1244550291.1634605439; _INSIGHT_CK_1101=e0e2860fab3ab16ca978df95f6a3dc4c_14698|a8749426e3d8f0df71f0df95f6a3dc4c_14698:1644217287000; WMONID=NJ1rbZbMcQG; JSESSIONID="ZqCHwLVq080CsBs7lW8kMFTNnlpwLeAZyon8yo3E.NLSEOJIWAS1:nl_seoji_1"; JSESSIONID_NL_USER="EK6AyZukS0G1MuJ4T2T3W9PUOCI_X8OLrFRjub2r.NLUWAS1:nl_main_1"',
}
data = {
'searchUrl': 'search?select=cip_id,rec_key,cip_key,form,set_expression,subject,series_no,ea_isbn,ea_add_code,ebook_yn,bib_yn,set_isbn,set_add_code,title,vol,author,publisher,series_title,edition_stmt,pre_price,publish_year,publish_predate,input_date,update_date,book_size,page,deposit_yn,real_publish_date,real_price,publisher_key,import_date,changed_date,title_url,kdc,ddc,publisher_url,book_introduction_url,book_summary_url,book_tb_cnt_url,control_no,cip_yn,index_series_title,index_title,index_author,index_publisher,related_isbn,form_detail,form_detail_version,kolis_control_no,kolis_img_path,book_introduction,book_tb_cnt,book_summary&from=cip.cip&where=text_idx%3D%22%EC%97%B0%EC%9E%AC%22%20allword%20and%20ebook_yn%3D%22Y%22%20and%20subject%3D%226%22%20order%20by%20publish_predate%20desc&offset=0&limit=100',
}
def Get_Start_Info():
SERVER = int(input("Номер сервера: "))
START_TIME_H = int(input("Через сколько часов: "))
START_TIME_H = START_TIME_H * 3600
START_TIME_M = int(input("Через сколько минут: "))
START_TIME_M = START_TIME_M * 60
START_SERVICES = int(input("Сеоджи(1)"))
START_TIME_END = START_TIME_H + START_TIME_M
return SERVER, START_TIME_END, START_SERVICES
bull = 0
h1 = 0
pop = 0
ooo = 0
response1 = requests.post('https://www.nl.go.kr/seoji/module/S80100000000_intgr_select_search_engine_data.ajax', headers=headers, cookies=cookies, data=data)
paux1 = np.array(response1.json()['result']['rows'])
sum_titles = response1.json()['result']['total_count']
deleter = 0
indexes = []
SIM = 0
async def Checker_num(srv, all_checker, checker):
global inf_tab_1
global inf_tab_2
global inf_tab_3
global cur_tab
global ooo
global h1
global SIM
global paux1
global sum_titles
global pop
global bull
global summa_title
global titles
global response1
global deleter
if checker == sum_titles: # Одинаковое ли кол-во элементов
deleter = deleter + 1
del all_checker
del checker
if deleter == 100: # чтобы память не засоряло
gc.collect()
deleter = 0
pass
else: # ТУТ ОШИБКА
paux2 = np.array(all_checker['result']['rows']) # перевод в np список для скорости
bull = 0
for i in range(99):
try: # (주)에브리웨이
if paux1[i - bull]['fields']['title'] != paux2[i]['fields']['title']:
bull = bull + 1
print(paux2[i]['fields']['title'], i)
except:
gc.collect()
pass
h1 = 0
paux1 = paux2
bull = 0
summa_title = 0
sum_titles = requests.post('https://www.nl.go.kr/seoji/module/S80100000000_intgr_select_search_engine_data.ajax', headers=headers, cookies=cookies, data=data)
sum_titles = sum_titles.json()['result']['total_count'] # сохраняет новое кол-во элементов
gc.collect()
#print("Скорее Всего Анонсов - ", l)
#print("Всего новых - ", summa_title)
async def get_pokemon(srv, session, url, headers, cookies, data): # отправляет кол-во элементов на проверку в Checker_num
async with session.post(url, headers=headers, cookies=cookies, data=data) as resp:
pokemon = await resp.json()
asyncio.ensure_future(Checker_num(srv ,pokemon ,pokemon['result']['total_count']))
return pokemon['result']['total_count']
async def main(srv): # делает запрос каждые 0.2 секуды и передаёт в get_pokemon
global sum_titles
async with aiohttp.ClientSession() as session:
tasks = []
while True:
url = 'https://www.nl.go.kr/seoji/module/S80100000000_intgr_select_search_engine_data.ajax'
tasks.append(asyncio.ensure_future(get_pokemon(srv, session, url, headers=headers, cookies=cookies, data=data)))
await asyncio.sleep(0.2)
original_pokemon = await asyncio.gather(*tasks)
try:
GLOBAL_INFO = Get_Start_Info()
SERVER_GLOBAL = GLOBAL_INFO[0]
TIME_START = GLOBAL_INFO[1]
#print("Сервер -",SERVER_GLOBAL, "| Через", TIME_START, "секунд")
asyncio.run(main(SERVER_GLOBAL))
except:
print('Ошибка:\n', traceback.format_exc())
finally:
driver.quit()