Parsing, bypassing Cloudflare
I need help writing a scraper for webmd. The goal is to scrape the list of doctors and all the information about each doctor (more than 100k pages). The scraper is set up and ready to run, but I've hit Cloudflare's CAPTCHA protection, and it has me stuck.
I've tried proxies, User-Agent tweaks, even session pooling, but I run into the CAPTCHA every time. Any ideas on how to get past this protection and solve the CAPTCHA automatically? I'd be grateful for any advice or ready-made solutions on this.
P.S. The question is not about bypassing protection for its own sake, but about automating data collection.
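Roughly, the proxy / User-Agent / session attempt looked like the sketch below. It is only an illustration: the proxy endpoint, credentials and header values are placeholders, not the ones actually used.

import requests

# Placeholder proxy endpoint and credentials, shown only to illustrate the setup
PROXY = "http://user:password@proxy-host:8000"

session = requests.Session()
session.headers.update({
    "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                   "AppleWebKit/537.36 (KHTML, like Gecko) "
                   "Chrome/120.0.0.0 Safari/537.36"),
    "Accept-Language": "en-US,en;q=0.9",
})
session.proxies.update({"http": PROXY, "https": PROXY})

resp = session.get("https://doctor.webmd.com/providers/new-york/islip", timeout=30)
print(resp.status_code)  # Cloudflare answers with a challenge page instead of the listing

The current code follows: the first script collects the profile URLs, the second one scrapes each profile.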
import threading
from sqlalchemy import create_engine, Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from selenium import webdriver
from selenium.webdriver.chrome.service import Service  # import the Service class
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import requests
from bs4 import BeautifulSoup
import time
driver_path = 'chromedriver.exe'
# No module-level browser here: each worker thread in extract_and_save_urls()
# starts its own Chrome instance.
def extract_urls_from_page(soup, target_zipcodes):
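    """Collect provider profile URLs from a results-page soup.

    Each candidate profile is fetched with requests and kept only if at least one
    ZIP code listed on it appears in target_zipcodes.
    """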
ul_element = soup.find("ul", class_="resultslist-content")
urls = []
if ul_element:
for list_item in ul_element.find_all("li"):
a_tag = list_item.find("a")
if a_tag:
url = a_tag.get("href")
response = requests.get(url)
if response.status_code == 200:
inner_soup = BeautifulSoup(response.content, 'html.parser')
zips_on_page = inner_soup.findAll('span', {'class': "location-zipcode loc-coi-loczip"})
for zip_span in zips_on_page:
if zip_span.text.strip() in target_zipcodes:
urls.append(url)
break
return urls
def extract_and_save_urls(base_url, db_url):
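    """Walk the paginated provider listing at base_url with Selenium, filter the
    profiles by ZIP code and store new URLs in the sites_crawled table of db_url."""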
# Database setup for each thread
Base = declarative_base()
class SitesCrawled(Base):
__tablename__ = 'sites_crawled'
id = Column(Integer, primary_key=True)
site = Column(String(1024))
engine = create_engine(db_url)
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
session = Session()
    driver = webdriver.Chrome(service=Service(driver_path))
driver.maximize_window()
driver.get(base_url)
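    # Keep only providers whose profile lists at least one of these ZIP codes
    # (Suffolk County, NY)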
target_zipcodes = [
'00501', '06390', '11701', '11702', '11703', '11704', '11705', '11706',
'11707', '11708', '11713', '11715', '11716', '11717', '11718', '11719',
'11720', '11721', '11722', '11724', '11725', '11726', '11727', '11729',
'11730', '11731', '11733', '11734', '11738', '11739', '11740', '11741',
'11742', '11743', '11745', '11746', '11747', '11749', '11750', '11751',
'11752', '11754', '11755', '11757', '11760', '11763', '11764', '11766',
'11767', '11768', '11769', '11770', '11772', '11775', '11776', '11777',
'11778', '11779', '11780', '11782', '11784', '11786', '11787', '11788',
'11789', '11790', '11792', '11794', '11795', '11796', '11798', '11901',
'11930', '11931', '11932', '11933', '11934', '11935', '11937', '11939',
'11940', '11941', '11942', '11944', '11946', '11947', '11948', '11949',
'11950', '11951', '11952', '11953', '11954', '11955', '11956', '11957',
'11958', '11959', '11960', '11961', '11962', '11963', '11964', '11965',
'11967', '11968', '11969', '11970', '11971', '11972', '11973', '11975',
'11976', '11977', '11978', '11980'
]
try:
while True:
            time.sleep(3)  # give the results page time to render
            # Ads, if any, are left alone here; the page source is parsed as-is
soup = BeautifulSoup(driver.page_source, 'html.parser')
current_page_urls = extract_urls_from_page(soup, target_zipcodes)
for url in current_page_urls:
existing_entry = session.query(SitesCrawled).filter_by(site=url).first()
if not existing_entry:
entry = SitesCrawled(site=url)
session.add(entry)
session.commit()
try:
next_button = WebDriverWait(driver, 10).until(
EC.element_to_be_clickable((By.CSS_SELECTOR, ".btn-next[title='Next Page']"))
)
driver.execute_script("arguments[0].click();", next_button)
except (NoSuchElementException, TimeoutException):
break
finally:
driver.quit()
session.close()
# Base URLs and Database URLs
base_urls = [
"https://doctor.webmd.com/providers/new-york/islip",
"https://doctor.webmd.com/providers/new-york/southold",
"https://doctor.webmd.com/providers/new-york/babylon",
"https://doctor.webmd.com/providers/new-york/brookhaven",
"https://doctor.webmd.com/providers/new-york/huntington",
"https://doctor.webmd.com/providers/new-york/smithtown",
"https://doctor.webmd.com/providers/new-york/shelter-island",
"https://doctor.webmd.com/providers/new-york/east-hampton",
"https://doctor.webmd.com/providers/new-york/southampton",
"https://doctor.webmd.com/providers/new-york/riverhead"
]
db_urls = [
"sqlite:///sites_crawled1.db3",
"sqlite:///sites_crawled2.db3",
"sqlite:///sites_crawled3.db3",
"sqlite:///sites_crawled4.db3",
"sqlite:///sites_crawled5.db3",
"sqlite:///sites_crawled6.db3",
"sqlite:///sites_crawled7.db3",
"sqlite:///sites_crawled8.db3",
"sqlite:///sites_crawled9.db3",
"sqlite:///sites_crawled10.db3",
]
# Create and start threads
threads = []
for i in range(len(base_urls)):
thread = threading.Thread(target=extract_and_save_urls, args=(base_urls[i], db_urls[i]))
threads.append(thread)
thread.start()
# Wait for all threads to finish
for thread in threads:
thread.join()
print("All URLs processed and saved to the respective databases.")
import requests
from bs4 import BeautifulSoup
import re
import sqlalchemy as sa
from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine, Column, Integer, String, DateTime, Text
from sqlalchemy.ext.declarative import declarative_base
import json
from selenium import webdriver
from selenium.webdriver.chrome.service import Service  # import the Service class
from selenium.webdriver.common.by import By
from multiprocessing import Pool
# Street-type suffixes used to split the street proper from any trailing internal/unit info
street_suffixes = [
    ' Boulevard ', ' BOULEVARD ', ' Broadway ', ' BROADWAY ', ' Building ', ' BUILDING ',
    ' Parkway ', ' PARKWAY ', ' Highway ', ' HIGHWAY ', ' Avenue ', ' AVENUE ',
    ' Street ', ' STREET ', ' Drive ', ' DRIVE ', ' Circle ', ' CIRCLE ', ' Place ', ' PLACE ',
    ' ROAD ', ' Road ', ' Hwy ', ' Lane ', ' LANE ', ' WAY ', ' Way ', ' Dr ', ' DR ',
    ' Ave, ', ' Ave ', ' AVE ', ' St ', ' ST ', ' Rd ', ' RD ', ' Hw ', ' Blvd ', ' BLVD ',
    ' Ln ', ' 25a ', ' 25A ', ' Pkwy ', ' PKWY ', ' Grn ', ' Ste ', ' STE ', ' Ctr ',
    ' HWY ', ' Plz ', ' PLZ ',
]
driver_path = 'chromedriver.exe'
# No module-level browser here: scrape_doctor_info() launches its own Chrome only
# when a profile lists more than four locations, and every multiprocessing worker
# re-imports this module, so a global webdriver.Chrome() would open one idle
# browser per worker.
def extract_address_parts_webmd(address):
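    """Split an address of the form '(street)(city)(state)(zip)' into its parts.

    Returns (street, street_internal, city, state, zipcode), or five None values
    when fewer than four parenthesised parts are found.
    """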
parts_pattern = r'\(([^)]*)\)'
parts = re.findall(parts_pattern, address)
if len(parts) < 4:
return None, None, None, None, None
street = parts[0].strip()
city = parts[1].strip().rstrip(',')
state = parts[2].strip().rstrip(',')
zipcode = parts[3].strip()
street_internal = None
for suffix in street_suffixes:
if suffix in street:
split_street = street.split(suffix, 1)
if len(split_street) > 1:
street = split_street[0] + suffix
street_internal = split_street[1].strip().rstrip('.')
return street, street_internal, city, state, zipcode
def parse_single_address(address_str):
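    """Turn a single '(street)(city)(state)(zip)' string into an address dict.

    The returned phone_number is always None here; phone numbers are extracted
    separately for multi-address profiles.
    """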
address_dict = {}
phone_number = None
street, street_internal, city, state, zipcode = extract_address_parts_webmd(address_str)
    # Split off suite/unit designators into the internal part of the address;
    # guard against a None street when the address could not be parsed.
    if street:
        for marker in (' Ste ', ' STE ', ' UNIT ', ' Suite ', ' APT ', ' Bldg '):
            if marker in street:
                parts = street.split(marker, 1)
                street = parts[0].strip()
                street_internal = marker + parts[1]
                break
address_dict['Street'] = street
    address_dict['Street_Internal'] = street_internal
address_dict['City'] = city
address_dict['State'] = state
address_dict['ZipCode'] = zipcode
return address_dict, phone_number
def extract_name_parts(full_name):
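    """Split a display name such as 'Dr. Jane A. Doe, MD' into title, first,
    middle and last name plus a generational prefix (Jr, III, ...) and a
    clinical suffix (MD, DO, ...)."""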
title, first_name, middle_name, last_name, prefix, suffix = '', '', '', '', '', ''
titles = ["Dr", "Md"]
prefixes = ['Jr', 'Sr', 'II', 'III', 'IV', 'V']
suffixes = ['MD', 'PhD', 'DO', 'NPC', 'NP', 'DPM', 'DC', 'FACS','DDS', 'OD', 'PsyD', 'RNP', 'DMD']
words = full_name.split()
    if words and words[0].startswith(tuple(titles)):
title = words.pop(0).rstrip(".")
if words and words[-1] in suffixes:
suffix = words.pop()
while words and words[-1].rstrip(',') in suffixes:
suffix = words.pop().rstrip(',') + ', ' + suffix
    elif words and ',' in words[-1]:
last_word = words.pop().split(',')
if last_word[-1] in suffixes:
suffix = last_word[-1]
words.append(last_word[0])
if words:
first_name = words.pop(0)
if words:
last_name = words.pop().rstrip(',')
if last_name in prefixes:
prefix = last_name
last_name = words.pop().rstrip(',')
if words:
middle_name = " ".join(words).rstrip(".")
return title, first_name, middle_name, last_name, prefix, suffix
def scrape_doctor_info(url):
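    """Scrape a single provider profile URL into a dict of fields.

    The page is fetched with requests first; when it lists more than four office
    locations, it is re-opened in Selenium so the "Show more" button can be
    clicked and the full location list captured.
    """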
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
doctor_name_title = soup.find('h1', {'class':'provider-full-name'})
if doctor_name_title:
title, first_name, middle_name, last_name, prefix, suffix = extract_name_parts(doctor_name_title.text)
else:
title, first_name, middle_name, last_name, prefix, suffix = 'None', 'None', 'None', 'None', 'None', 'None'
doctor_specialty = soup.find('div', {'class':'prov-specialties-wrap'})
doctor_education = soup.findAll('div', {"class":"education-wrapper webmd-row"})
street_elements = soup.findAll('div', {'class': "location-address loc-coi-locad webmd-row"})
streets = [text.get_text().strip() for text in street_elements]
city_elements = soup.findAll('span', {'class': "location-city loc-coi-loccty"})
cities = [text.get_text().strip() for text in city_elements]
state_elements = soup.findAll('span', {'class': "location-state loc-coi-locsta"})
states = [text.get_text().strip() for text in state_elements]
zipcode_elements = soup.findAll('span', {'class': "location-zipcode loc-coi-loczip"})
zipcodes = [text.get_text().strip() for text in zipcode_elements]
phone_elements = soup.findAll('div', {'class': "location-phone webmd-row"})
phones = [text.get_text().strip() for text in phone_elements]
phones += [None] * (len(streets) - len(phones))
address_list = [
f"({street})({city})({state})({zipcode}){' ' + phone if phone else ''}"
for street, city, state, zipcode, phone in zip(streets, cities, states, zipcodes, phones)
]
address_str = ' || '.join(address_list)
if address_str.count("||") > 3:
        driver = webdriver.Chrome(service=Service(driver_path))
driver.maximize_window()
driver.get(url)
        # Dismiss ad overlays and expand the office list when the corresponding
        # buttons exist; anything missing or unclickable is simply skipped.
        for xpath in (
            '//*[@id="adserved-ppg"]/div/div/div[1]/div[2]',
            '//*[@id="adserved-ppg"]/div/div/div/div/div[1]',
            '//*[@id="bottombannerad"]/button',
            '//*[@id="office-info"]/div/div[2]/div[2]/button',  # "Show more" offices
        ):
            try:
                driver.find_element(By.XPATH, xpath).click()
            except Exception:
                pass
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')
        driver.quit()
        # Rebuild the location list from the refreshed page so the extra offices
        # revealed by "Show more" are actually captured
        streets = [el.get_text().strip() for el in soup.find_all('div', {'class': "location-address loc-coi-locad webmd-row"})]
        cities = [el.get_text().strip() for el in soup.find_all('span', {'class': "location-city loc-coi-loccty"})]
        states = [el.get_text().strip() for el in soup.find_all('span', {'class': "location-state loc-coi-locsta"})]
        zipcodes = [el.get_text().strip() for el in soup.find_all('span', {'class': "location-zipcode loc-coi-loczip"})]
        phones = [el.get_text().strip() for el in soup.find_all('div', {'class': "location-phone webmd-row"})]
        phones += [None] * (len(streets) - len(phones))
        address_list = [
            f"({street})({city})({state})({zipcode}){' ' + phone if phone else ''}"
            for street, city, state, zipcode, phone in zip(streets, cities, states, zipcodes, phones)
        ]
        address_str = ' || '.join(address_list)
university = [text.get_text() for text in doctor_education]
university_str = ' || '.join(university)
    street_internal = None
    doctor_data = {
        "Website": "WebMD",
        "Name": doctor_name_title.text if doctor_name_title else 'None',
        "Title": title,
        "FirstName": first_name,
        "MiddleName": middle_name,
        "LastName": last_name,
        "Prefix": prefix,
        "Suffix": suffix,
        "Specialty": ' | '.join(doctor_specialty.text.split(' ')) if doctor_specialty else 'None',
        "University": university_str,
        "Address": address_str,
        "Street_Internal": street_internal,
        "PhoneNumber": 'none'
    }
    if '||' not in doctor_data['Address']:
parsed_address, phone_number = parse_single_address(doctor_data['Address'])
doctor_data['Address'] = json.dumps(parsed_address)
doctor_data['PhoneNumber'] = phone_number
return doctor_data
def extract_url_key(url):
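    """Return the UUID-style identifier embedded in a profile URL, or None."""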
match = re.search(r'([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})', url)
return match.group(0) if match else None
DATABASE_URLS = [
"sqlite:///sites_crawled1.db3",
"sqlite:///sites_crawled2.db3",
"sqlite:///sites_crawled3.db3",
"sqlite:///sites_crawled4.db3",
"sqlite:///sites_crawled5.db3",
"sqlite:///sites_crawled6.db3",
"sqlite:///sites_crawled7.db3",
"sqlite:///sites_crawled8.db3",
"sqlite:///sites_crawled9.db3",
"sqlite:///sites_crawled10.db3"
]
Base = declarative_base()
class SitesCrawled(Base):
__tablename__ = 'sites_crawled'
id = Column(Integer, primary_key=True)
site = Column(String(1024))
class DoctorScrapedInfo(Base):
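    """One scraped provider profile; street holds 'multiple addresses' when the
    profile lists more than one office."""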
__tablename__ = 'scraped_doctors_info'
id = Column(Integer, primary_key=True)
site_crawled_id = Column(Integer, nullable=False)
url_key = Column(String(1024))
site = Column(String(1024), nullable=False)
website = Column(String(1024))
name = Column(String(1024))
title = Column(String(1024))
first_name = Column(String(1024))
middle_name = Column(String(1024))
last_name = Column(String(1024))
personal_suffix = Column(String(1024))
clinical_suffix = Column(String(1024))
specialty = Column(String(1024))
university = Column(String(1024))
street = Column(String(1024))
street_internal = Column(String(1024))
city = Column(String(1024))
state = Column(String(1024))
zip_code = Column(String(1024))
phone_number = Column(String(1024))
composite_key = Column(String(1024))
@property
def generate_composite_key(self):
return f"{self.first_name}_{self.last_name}_{self.street}_{self.zip_code}"
class MultipleAddresses(Base):
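    """Individual office locations for providers whose profile lists more than one address."""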
__tablename__ = 'multiple_addresses'
id = Column(Integer, primary_key=True)
doctor_scraped_info_id = Column(Integer, nullable=False)
street = Column(String(1024))
street_internal = Column(String(1024))
city = Column(String(1024))
state = Column(String(1024))
zip_code = Column(String(1024))
phone_number = Column(String(1024))
composite_key = Column(String(1024))
# Define your process_database function
def process_database(db_url):
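    """Scrape every URL stored in one of the SQLite databases produced by the
    first script and write the results to scraped_doctors_info / multiple_addresses."""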
# SQLAlchemy setup
engine = sa.create_engine(db_url)
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
# Create a new session for each database
session = Session()
urls = session.query(SitesCrawled.id, SitesCrawled.site).all()
for url in urls:
try:
doctor_info = scrape_doctor_info(url[1])
if doctor_info:
title, first_name, middle_name, last_name, prefix, suffix = extract_name_parts(doctor_info['Name'])
street = "multiple addresses" if '||' in doctor_info['Address'] else ''
street_internal = None
city = None
state = None
zip_code = None
if not street:
address_dict = json.loads(doctor_info['Address'])
street = address_dict.get('Street')
street_internal = address_dict.get('Street_Internal')
city = address_dict.get('City')
state = address_dict.get('State')
zip_code = address_dict.get('ZipCode')
if street:
doctor = DoctorScrapedInfo(
site_crawled_id=url[0], site=url[1], website=doctor_info['Website'],
name=doctor_info['Name'], title=title, first_name=first_name,
middle_name=middle_name, last_name=last_name, personal_suffix=prefix,
clinical_suffix=suffix, specialty=doctor_info['Specialty'],
university=doctor_info['University'], street=street,
street_internal=street_internal, city=city,
state=state, zip_code=zip_code, phone_number=doctor_info['PhoneNumber']
)
# Setting composite_key and url_key after creating the doctor instance
doctor.composite_key = doctor.generate_composite_key
doctor.url_key = extract_url_key(url[1])
                    session.add(doctor)
                    session.commit()  # commit also assigns doctor.id, used for the address rows below
if '||' in doctor_info['Address']:
addresses = doctor_info['Address'].split('||')
for address in addresses:
address = address.strip()
if address:
phone_number = None
tel_index = address.find('Tel:')
if tel_index != -1:
phone_number = address[tel_index+len('Tel:'):].strip()
address = address[:tel_index].strip()
street, street_internal, city, state, zipcode = extract_address_parts_webmd(address)
multiple_address = MultipleAddresses(
doctor_scraped_info_id=doctor.id,
street=street,
street_internal=street_internal,
city=city,
state=state,
zip_code=zipcode,
phone_number=phone_number
)
# Generate the Composite Key using data from DoctorScrapedInfo
composite_key = f"{doctor.first_name}_{doctor.last_name}_{street}_{zipcode}"
multiple_address.composite_key = composite_key
session.add(multiple_address)
session.commit()
        except Exception as e:
            session.rollback()  # drop the failed transaction so the remaining URLs can still be processed
            print(f"Error processing URL {url[1]}: {str(e)}\n")
session.close()
# Ensure the multiprocessing code only runs when the script is executed directly
if __name__ == '__main__':
# Create a pool of worker processes
with Pool(len(DATABASE_URLS)) as pool:
# Map the process_database function to each database URL
pool.map(process_database, DATABASE_URLS)