Async crawler demo 2

Published 2023-06-04 15:13:56  Author: 无情の王子
import re
import aiohttp
import asyncio


class Asyn:
    def __init__(self):
        self.__headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48',
        }

    async def fetch(self, session, url):
        print("Sending request:", url)
        async with session.get(url, headers=self.__headers) as response:
            content = await response.text()
            # Each match is a (poem text, author) tuple taken from the share <textarea> on the detail page
            poetry_infos = re.findall(r'textarea style=".*>(.*?)——(.*?)https://so.gushiwen.cn/shiwenv_.*.aspx</textarea>', content)
            for item in poetry_infos:
                print(item)
                # Append each (poem, author) tuple to a local text file
                with open('古诗内容111.txt', 'a', encoding='utf-8') as f:
                    f.write(f'{item}\n')

    async def main(self):
        async with aiohttp.ClientSession() as session:
            url = "https://so.gushiwen.org/gushi/tangshi.aspx"
            # Fetch the index page, then collect each poem's relative detail-page path
            async with session.get(url, headers=self.__headers) as response:
                index_html = await response.text()
            lsc = re.findall(r'<span><a href="/(.*)" target="_blank">.*</a>.*</span>', index_html)
            tasks = [self.fetch(session, f"https://so.gushiwen.org/{path}") for path in lsc]
            await asyncio.gather(*tasks)

# Create an instance and run the main method
asyn = Asyn()
# asyncio.run(asyn.main()) raised an error in this environment, so drive the coroutine with an explicit event loop instead
loop = asyncio.get_event_loop()
loop.run_until_complete(asyn.main())
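
If the asyncio.run failure is the well-known "RuntimeError: Event loop is closed" that aiohttp can trigger on Windows' proactor event loop, switching to the selector event loop policy usually lets asyncio.run work as-is. A minimal sketch under that assumption, reusing the Asyn class above:

import sys
import asyncio

# Assumption: the failure comes from the Windows proactor event loop; the selector
# policy avoids the noisy teardown so asyncio.run can close the loop cleanly.
if sys.platform == 'win32':
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

asyncio.run(Asyn().main())

asyncio.run is also the documented entry point for scripts on current Python versions, where creating a loop via asyncio.get_event_loop() outside a running loop is deprecated.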