Не удается получить данные с сайта

Question

Не удается получить данные с сайта

Рейтинг: 1 · Ответов: 1 · Опубликовано: 26.06.2023

Пишу парсер для сайта https://www.shibor.org/shibor/dataservicesen/

Необходимо получить данные с фрейма Shibor

Имеется код

import requests
from bs4 import BeautifulSoup
import re

# Page to scrape; the code below looks for an iframe reference inside it.
URL  = 'https://www.shibor.org/shibor/dataservicesen/'

# Request headers with a desktop Chrome User-Agent; get_html passes them to requests.
HEADERS = {'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"}

def get_html(url):
    """Request *url* with the module-level HEADERS and return the response.

    The response is handed back unchanged; the status code is not inspected
    here, so callers decide what counts as success.
    """
    response = requests.get(url, headers=HEADERS)
    return response

def get_content_iframe(html):
    """Follow the iframe referenced in *html* and return its parsed document.

    Args:
        html: Markup of the outer page, as text.

    Returns:
        A BeautifulSoup tree built from the iframe document fetched with
        get_html.
    """
    outer_markup = str(BeautifulSoup(html, 'html.parser'))
    match = re.search(
        r'(src=\"/[A-za-z/\-\.]+shibor\-def\-down\-iframe\-e\.html\")',
        outer_markup,
    )
    # match[0] has the form src="/some/path.html"; the path is the text
    # between the two double quotes.
    iframe_path = match[0].split('"')[1]
    iframe_response = get_html('https://www.shibor.org' + iframe_path)
    return BeautifulSoup(iframe_response.text, 'html.parser')

def parser(url):
    """Return the parsed iframe document for *url*, or None on a non-200 reply."""
    page = get_html(url)
    if page.status_code != 200:
        return None
    return get_content_iframe(page.text)

# Fetch the iframe document, then look up the div with id "page-shibor-history".
new_soup = parser(URL)
# NOTE(review): BeautifulSoup only special-cases `class_`; `id_` is taken as an
# attribute literally named "id_", so nothing matches and None is printed.
# The attribute filter for ids is spelled `id=`.
qw = new_soup.find('div', id_="page-shibor-history")
print(qw)

Но в ответе получаю None, хотя, если просматривать HTML, тег с таким id существует.

Необходимо получить данные из этой таблицы

python html5 beautiful-soup

Источник: Stack Overflow на русском

Answer 1

▲ 0

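В BeautifulSoup фильтр по атрибуту id записывается как `id=`, без подчёркивания (подчёркивание нужно только для `class_`, так как `class` — зарезервированное слово Python). Вам нужно использовать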

find('div', id="page-shibor-history")

а не

find('div', id_="page-shibor-history")

Вот рабочий код, можете его проанализировать.

import requests
from bs4 import BeautifulSoup
import re

# Page to scrape; the code below looks for an iframe reference inside it.
URL = 'https://www.shibor.org/shibor/dataservicesen/'
# Request headers with a desktop Chrome User-Agent; get_html passes them to requests.
HEADERS = {'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"}

def get_html(url, timeout=30):
    """Download *url* and return the response when the server answers 200.

    Args:
        url: Address to request; the module-level HEADERS are sent with it.
        timeout: Seconds requests waits for the server. Without a timeout a
            stalled connection would block the script indefinitely.

    Returns:
        The requests response on HTTP 200, otherwise None (the status code
        is printed first).
    """
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    if response.status_code == 200:
        return response
    print(response.status_code)
    # Explicit so that callers can see the failure value they must handle.
    return None

def get_content_iframe(html):
    """Locate the Shibor iframe in the outer page and download its document.

    Args:
        html: Response object of the outer page; only its ``.text`` is read.

    Returns:
        Whatever get_html returns for the iframe URL.

    Raises:
        ValueError: If no iframe ``src`` matching the expected file name is
            present in the page.
    """
    soup = BeautifulSoup(html.text, 'html.parser')
    # NOTE(review): the range `A-z` also covers the punctuation characters
    # between 'Z' and 'a' in ASCII; the pattern is kept exactly as written.
    pattern = r'(src=\"/[A-za-z/\-\.]+shibor\-def\-down\-iframe\-e\.html\")'
    src = re.search(pattern, str(soup))
    if src is None:
        # Indexing a failed match would raise an opaque TypeError instead.
        raise ValueError('iframe shibor-def-down-iframe-e.html not found in the page')
    # src[0] has the form src="/some/path.html"; take the text between the quotes.
    new_url = 'https://www.shibor.org' + src[0].split('"')[1]
    return get_html(new_url)

def parser(url):
    """Fetch the outer page at *url* and return the iframe download result.

    Returns:
        The value of get_content_iframe for the outer page, or None when the
        outer page could not be fetched with status 200.
    """
    response = get_html(url)
    if response is None:
        # get_html yields None for a non-200 reply; reading .status_code on
        # it would raise AttributeError.
        return None
    if response.status_code != 200:
        print(response.status_code)
        return None
    return get_content_iframe(response)

def main():
    """Print the div with id "page-shibor-history" from the iframe document."""
    html = parser(URL)
    if html is None:
        # The download failed somewhere along the chain; there is nothing to
        # parse, and html.text would raise AttributeError.
        return
    soup = BeautifulSoup(html.text, 'html.parser')
    qw = soup.find('div', id="page-shibor-history")
    print(qw)


if __name__ == "__main__":
    main()

Так как сайт использует JavaScript для загрузки содержимого, будем использовать Playwright.

Устанавливаем playwright

pip install --upgrade pip
pip install playwright
playwright install

Вот обновлённый код:

import requests
from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup
import re



# Page to scrape; the code below looks for an iframe reference inside it.
URL = 'https://www.shibor.org/shibor/dataservicesen/'
# Request headers with a desktop Chrome User-Agent string.
HEADERS = {'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"}

def get_html(url, timeout_ms=0):
    """Render *url* in headless Chromium and return the resulting HTML.

    Args:
        url: Address to open in the browser.
        timeout_ms: Default Playwright timeout for the page, in milliseconds.
            The default of 0 disables the timeout, so the call waits for as
            long as the page needs.

    Returns:
        The page markup as a string, taken after the element
        ``div[id='shibor-his-cn']`` has appeared.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch()
        try:
            page = browser.new_page()
            page.set_default_timeout(timeout_ms)
            page.goto(url)
            # The content is filled in by JavaScript; wait until it exists.
            page.wait_for_selector("div[id='shibor-his-cn']")
            return page.content()
        finally:
            # Shut the browser down even when navigation or waiting fails.
            browser.close()


def get_content_iframe(html):
    """Locate the Shibor iframe in the outer page and load its document.

    Args:
        html: Response object of the outer page; only its ``.text`` is read.

    Returns:
        Whatever get_html returns for the iframe URL.

    Raises:
        ValueError: If no iframe ``src`` matching the expected file name is
            present in the page.
    """
    soup = BeautifulSoup(html.text, 'html.parser')
    pattern = r'(src=\"/[A-za-z/\-\.]+shibor\-def\-down\-iframe\-e\.html\")'
    src = re.search(pattern, str(soup))
    if src is None:
        # Indexing a failed match would raise an opaque TypeError instead.
        raise ValueError('iframe shibor-def-down-iframe-e.html not found in the page')
    # src[0] has the form src="/some/path.html"; take the text between the quotes.
    new_url = 'https://www.shibor.org' + src[0].split('"')[1]
    return get_html(new_url)

def parser(url, timeout=30):
    """Fetch the outer page at *url* and return the iframe content.

    Args:
        url: Address of the outer page.
        timeout: Seconds requests waits for the server before giving up.

    Returns:
        The value of get_content_iframe for the outer page, or None when the
        server did not answer with status 200 (the status code is printed).
    """
    # Send the module-level HEADERS: they are defined for this purpose, and
    # a request without a browser User-Agent may be treated differently.
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    if response.status_code != 200:
        print(response.status_code)
        return None
    return get_content_iframe(response)

def main():
    """Print the div with id "page-shibor-history" from the iframe document."""
    html = parser(URL)
    if html is None:
        # parser yields None when the outer page was not fetched; passing
        # None to BeautifulSoup would raise instead of reporting that.
        return
    soup = BeautifulSoup(html, 'html.parser')
    qw = soup.find('div', id="page-shibor-history")
    print(qw)



if __name__ == "__main__":
    main()

Не удается получить данные с сайта

Ответы