Ускорение парсинга с помощью asyncio

Рейтинг: 0 · Ответов: 0 · Опубликовано: 21.07.2023

Мне нужно как-то ускорить парсинг до 3 секунд. Возможно ли это? Всего там 3000 объявлений: по 1000 в каждой из 3 категорий.

import aiohttp
import json
import time
import asyncio

# Request headers sent with every search request issued by get_ads().
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
    "Accept": "application/json, text/plain, */*",
    "device": "pc",
}

# Categories to scrape; "id" goes into the search URL, "name" is copied
# into every ad record collected for that category.
categories = [
    {"id": category_id, "name": category_name}
    for category_id, category_name in (
        (5830, "Продажа автомобилей"),
        (2043, "Аренда квартир"),
        (2046, "Продажа квартир"),
    )
]

# Number of ads to collect for each category.
total_ads_limit_per_category = 1_000


async def get_ads(session, category):
    """Collect up to ``total_ads_limit_per_category`` ads for one category.

    Result pages are requested concurrently (instead of one after another)
    and are processed in page order, so the returned ads keep the same order
    a sequential crawl would produce.

    Args:
        session: An aiohttp-style client session; only ``session.get(url,
            headers=...)`` used as an async context manager is required.
        category: Mapping with the keys ``"id"`` (used in the search URL) and
            ``"name"`` (stored in every returned ad under ``"category"``).

    Returns:
        A list of at most ``total_ads_limit_per_category`` dicts with the keys
        ``title``, ``description``, ``price``, ``currency``, ``city``,
        ``mobile``, ``images``, ``category`` and ``author``. The list is
        shorter when the feed runs out of ads before the limit is reached.

    Raises:
        KeyError: If an item in the response lacks one of the expected fields.
    """
    category_id = category["id"]
    category_name = category["name"]
    per_page = 40
    limit = total_ads_limit_per_category

    async def fetch_items(page):
        # One search request; returns the list of raw items on that page.
        url = f"http://lalafo.kg/api/search/v3/feed/search?expand=url&per-page={per_page}&category_id={category_id}&page={page}"
        async with session.get(url, headers=headers) as response:
            response_data = await response.json()
        return response_data.get("items") or []

    ads_data = []
    next_page = 1

    while len(ads_data) < limit:
        # Request exactly as many pages as are needed to reach the limit if
        # every page is full (ceiling division), all at the same time.
        missing = limit - len(ads_data)
        batch_size = -(-missing // per_page)
        pages = await asyncio.gather(
            *(fetch_items(page) for page in range(next_page, next_page + batch_size))
        )
        next_page += batch_size

        feed_exhausted = False
        for items in pages:
            if not items:
                # An empty page means there is nothing more to fetch; without
                # this check the loop would never terminate.
                feed_exhausted = True
                break
            for item in items[:per_page]:
                ads_data.append({
                    "title": item["title"],
                    "description": item["description"],
                    "price": item["price"],
                    "currency": item["currency"],
                    "city": item["city"],
                    "mobile": item["mobile"],
                    "images": [image["original_url"] for image in item["images"]],
                    "category": category_name,
                    "author": item["user"]["username"],
                })

        if feed_exhausted:
            break

    # The last page may push the count past the limit; trim the surplus.
    return ads_data[:limit]


async def send_ads_data(data):
    """POST ``data`` as JSON to the local endpoint and return the reply text.

    A dedicated client session is opened for this single request and closed
    again before the function returns.
    """
    endpoint = 'http://127.0.0.1:8000'
    async with aiohttp.ClientSession() as client, client.post(endpoint, json=data) as reply:
        body = await reply.text()
    return body

async def main():
    """Scrape every configured category concurrently and forward the ads.

    One shared client session (connection limit 1000) is used for all
    categories; the per-category results are flattened into a single list,
    in category order, and handed to ``send_ads_data``.
    """
    connector = aiohttp.TCPConnector(limit=1000)
    async with aiohttp.ClientSession(connector=connector) as session:
        per_category = await asyncio.gather(
            *[get_ads(session, category) for category in categories]
        )

        # Flatten the list of per-category lists, keeping the original order.
        all_ads_data = [ad for chunk in per_category for ad in chunk]

        await send_ads_data(all_ads_data)

if __name__ == "__main__":
    # Script entry point: run the scraper once and report how long it took.
    # perf_counter() is a monotonic, high-resolution clock, so the measured
    # duration cannot be skewed by system clock adjustments the way a
    # difference of time.time() values can.
    start_time = time.perf_counter()
    asyncio.run(main())
    end_time = time.perf_counter()
    elapsed_time = end_time - start_time
    print(f"Время выполнения скрипта: {elapsed_time} секунд")

Ответы

Ответов пока нет.