爬虫获取网页开发者模式 Network 信息

发布时间 2023-11-21 17:36:48 作者: 后跳
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading;
using System.Threading.Tasks;
using OpenQA.Selenium;
using OpenQA.Selenium.DevTools;
using OpenQA.Selenium.DevTools.V85.Network;
using DevToolsSessionDomains = OpenQA.Selenium.DevTools.V85.DevToolsSessionDomains;

// 2023.11.20 author by Zingu ft. NewBing

namespace PuppeteerSharp_Test
{
    /// <summary>
    /// Selenium scraping demo. It can collect course links from a listing page and
    /// record the URLs of the network responses a page triggers, using the
    /// DevTools (protocol v85) Network domain.
    /// </summary>
    public class Demo
    {
        // Page opened by RunAsync while network responses are being recorded.
        private const string TargetUrl = "https://open.163.com/newview/movie/free?pid=MA5T0OVML&mid=MA5T1488U";

        // Fixed wait, in milliseconds, applied after navigation before the page is inspected.
        private const int PageSettleDelayMs = 500;

        // Keys of the per-response record stored in _responses.
        private const string RequestIdKey = "requestId";
        private const string UrlKey = "url";

        // One record per received response, holding its request id and URL.
        private readonly List<Dictionary<string, string>> _responses = new List<Dictionary<string, string>>();

        // Guards _responses: the ResponseReceived handler appends while readers take snapshots.
        private readonly object _responsesLock = new object();

        // Customize the constructor as needed.
        public Demo() { }

        /// <summary>
        /// Opens <paramref name="url"/> in Internet Explorer and collects the link of every
        /// "course-link" element found inside the "course-list" element.
        /// </summary>
        /// <param name="url">Address of the page to scrape.</param>
        /// <returns>
        /// One tuple per course link: Item1 is the anchor's "href" attribute,
        /// Item2 is its "title" attribute.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
        public async Task<List<Tuple<string, string>>> GetListAsync(string url)
        {
            // Reject a null URL before a browser process is started for nothing.
            if (url == null)
            {
                throw new ArgumentNullException(nameof(url));
            }

            var links = new List<Tuple<string, string>>();
            using (IWebDriver driver = new OpenQA.Selenium.IE.InternetExplorerDriver())
            {
                driver.Navigate().GoToUrl(url);

                // Wait without blocking the calling thread (previously Thread.Sleep).
                await Task.Delay(PageSettleDelayMs);

                var courseList = driver.FindElement(By.ClassName("course-list"));
                foreach (var courseLink in courseList.FindElements(By.ClassName("course-link")))
                {
                    var anchor = courseLink.FindElement(By.TagName("a"));
                    string href = anchor.GetAttribute("href");
                    string title = anchor.GetAttribute("title");
                    links.Add(new Tuple<string, string>(href, title));
                }
            }
            return links;
        }

        /// <summary>
        /// Main flow: starts Edge, records the responses received while the target page
        /// loads, prints the URLs that look like media files, then shuts the browser down
        /// and waits for a line on standard input.
        /// </summary>
        /// <returns>A task that completes once the console line has been read.</returns>
        public async Task RunAsync()
        {
            // Start from a clean slate so a repeated run does not report stale responses.
            lock (_responsesLock)
            {
                _responses.Clear();
            }

            // Create the driver; no options are configured in this example.
            // var dr = new OpenQA.Selenium.Chrome.ChromeDriver();
            var dr = new OpenQA.Selenium.Edge.EdgeDriver();
            try
            {
                // Open a DevTools session and get the version-specific domains.
                var session = dr.GetDevToolsSession(85);
                var domains = session.GetVersionSpecificDomains<DevToolsSessionDomains>();

                // Enable the Network domain and subscribe to received responses.
                await domains.Network.Enable(new OpenQA.Selenium.DevTools.V85.Network.EnableCommandSettings());
                domains.Network.ResponseReceived += Network_ResponseReceived;
                try
                {
                    // Open the target site.
                    dr.Url = TargetUrl;

                    await Task.Delay(PageSettleDelayMs);
                    foreach (var item in GetRequestUrl())
                    {
                        Console.WriteLine(item);
                    }

                    // To fetch a response body instead, pass a request id to GetResponseBody:
                    //   var cmd = new OpenQA.Selenium.DevTools.V85.Network.GetResponseBodyCommandSettings();
                    //   cmd.RequestId = GetRequestId();
                    //   var rlt = await domains.Network.GetResponseBody(cmd);
                    //   Console.WriteLine(rlt.Body);
                }
                finally
                {
                    // Always detach the handler so this instance is not kept subscribed.
                    domains.Network.ResponseReceived -= Network_ResponseReceived;
                }

                // Disable the Network domain again.
                await domains.Network.Disable();
            }
            finally
            {
                // Close the browser even when a step above failed.
                dr.Quit();
            }
            Console.ReadLine();
        }

        /// <summary>
        /// Returns the URLs of the recorded responses that contain ".mp4" or ".srt".
        /// </summary>
        /// <returns>Matching URLs, in the order the responses were recorded.</returns>
        private List<string> GetRequestUrl()
        {
            // Adjust the filter in IsMediaUrl to match your needs.
            return SnapshotResponses()
                .Select(response => response[UrlKey])
                .Where(url => IsMediaUrl(url))
                .ToList();
        }

        /// <summary>
        /// Gets a request id from the recorded responses.
        /// </summary>
        /// <returns>
        /// The request id of the last recorded response whose URL is not null,
        /// or an empty string when there is none.
        /// </returns>
        private string GetRequestId()
        {
            string requestId = "";
            foreach (var response in SnapshotResponses())
            {
                string url = response[UrlKey];
                if (url == null)
                {
                    continue;
                }

                // Media URLs are also echoed to the console while scanning.
                if (IsMediaUrl(url))
                {
                    Console.WriteLine(url);
                }

                // Selection condition: any response with a URL; tighten as required.
                requestId = response[RequestIdKey];
            }
            return requestId;
        }

        /// <summary>
        /// Handles the Network ResponseReceived event by recording the response.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Details of the received response; only the request id and URL are kept.</param>
        private void Network_ResponseReceived(object sender, OpenQA.Selenium.DevTools.V85.Network.ResponseReceivedEventArgs e)
        {
            var record = new Dictionary<string, string>();
            record.Add(RequestIdKey, e.RequestId);
            record.Add(UrlKey, e.Response.Url);

            // The shared list must only be touched while holding the lock.
            lock (_responsesLock)
            {
                _responses.Add(record);
            }
        }

        /// <summary>
        /// Copies the recorded responses under the lock so callers can enumerate them
        /// while new responses keep arriving.
        /// </summary>
        /// <returns>A new list holding the records present at the time of the call.</returns>
        private List<Dictionary<string, string>> SnapshotResponses()
        {
            lock (_responsesLock)
            {
                return _responses.ToList();
            }
        }

        /// <summary>
        /// Tells whether a URL contains ".mp4" or ".srt".
        /// </summary>
        /// <param name="url">URL to test; may be null.</param>
        /// <returns>True when the URL is not null and contains one of the two markers.</returns>
        private static bool IsMediaUrl(string url)
        {
            return url != null && (url.Contains(".mp4") || url.Contains(".srt"));
        }
    }
}