C#获取HTML源码(2024年03月23日记录):以前从网上找到的那个方法(即下面的 HttpService 类)在一些网站上用不了,例如17K:取出来的要么是乱码,要么是一坨JS,好像是页面用JS又重新加载了内容。
using System;
using System.Collections.Generic;
using System.Web;
using System.Net;
using System.IO;
using System.Text;
using System.Net.Security;
using System.Security.Authentication;
using System.Security.Cryptography.X509Certificates;

namespace Niunan.XiaoShuo.Util
{
    /// <summary>
    /// Low-level HTTP helper built on <see cref="HttpWebRequest"/>: performs blocking
    /// GET and POST requests and returns the response body as a trimmed UTF-8 string.
    /// </summary>
    public class HttpService
    {
        /// <summary>
        /// Certificate validation callback that accepts every server certificate.
        /// </summary>
        /// <remarks>
        /// SECURITY: returning <c>true</c> unconditionally disables TLS certificate
        /// checks. It is installed on the static <see cref="ServicePointManager"/>, so
        /// it affects every request made through it afterwards, not only the one being
        /// sent. Kept because callers rely on it to reach sites that otherwise fail to
        /// open; do not use for traffic that must be protected from interception.
        /// </remarks>
        /// <returns>Always <c>true</c>.</returns>
        public static bool CheckValidationResult(object sender, X509Certificate certificate, X509Chain chain, SslPolicyErrors errors)
        {
            return true;
        }

        /// <summary>
        /// Sends <paramref name="xml"/> as the UTF-8 encoded body of a POST request.
        /// </summary>
        /// <param name="xml">Request body text (any text payload, despite the name).</param>
        /// <param name="url">Target URL.</param>
        /// <param name="isUseCert">Ignored: no client certificate is ever attached. Kept for signature compatibility.</param>
        /// <param name="timeout">Request timeout in seconds.</param>
        /// <param name="contenttype">Content-Type header, e.g. application/x-www-form-urlencoded or text/xml.</param>
        /// <param name="Authorization">Authorization header value; the header is omitted when null or empty.</param>
        /// <returns>The response body decoded as UTF-8, with surrounding whitespace trimmed.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="xml"/> or <paramref name="url"/> is null.</exception>
        /// <exception cref="UriFormatException"><paramref name="url"/> is not a valid URI.</exception>
        /// <exception cref="WebException">The request failed or timed out.</exception>
        public static string Post(string xml, string url, bool isUseCert, int timeout, string contenttype = "application/x-www-form-urlencoded", string Authorization = "")
        {
            if (xml == null)
            {
                throw new ArgumentNullException(nameof(xml));
            }

            HttpWebRequest request = CreateRequest(url, "POST");
            try
            {
                request.Timeout = timeout * 1000; // HttpWebRequest.Timeout is in milliseconds
                if (!string.IsNullOrEmpty(Authorization))
                {
                    request.Headers.Add(HttpRequestHeader.Authorization, Authorization);
                }

                request.ContentType = contenttype;
                byte[] data = Encoding.UTF8.GetBytes(xml);
                request.ContentLength = data.Length;

                // Dispose the request stream even when the write fails, so the
                // connection is released without relying on garbage collection.
                using (Stream reqStream = request.GetRequestStream())
                {
                    reqStream.Write(data, 0, data.Length);
                }

                return ReadBody(request);
            }
            finally
            {
                request.Abort();
            }
        }

        /// <summary>
        /// Performs an HTTP GET request and returns the response body.
        /// </summary>
        /// <param name="url">URL to request.</param>
        /// <returns>The response body decoded as UTF-8, with surrounding whitespace trimmed.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
        /// <exception cref="UriFormatException"><paramref name="url"/> is not a valid URI.</exception>
        /// <exception cref="WebException">The request failed.</exception>
        public static string Get(string url)
        {
            HttpWebRequest request = CreateRequest(url, "GET");
            try
            {
                return ReadBody(request);
            }
            finally
            {
                request.Abort();
            }
        }

        /// <summary>
        /// Applies the shared connection settings and creates the request object.
        /// </summary>
        private static HttpWebRequest CreateRequest(string url, string method)
        {
            if (url == null)
            {
                throw new ArgumentNullException(nameof(url));
            }

            ServicePointManager.DefaultConnectionLimit = 200;

            if (url.StartsWith("https", StringComparison.OrdinalIgnoreCase))
            {
                // See the security note on CheckValidationResult.
                ServicePointManager.ServerCertificateValidationCallback = new RemoteCertificateValidationCallback(CheckValidationResult);
            }

            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Method = method;
            return request;
        }

        /// <summary>
        /// Sends the request and reads the whole response body as trimmed UTF-8 text,
        /// disposing the response, its stream and the reader on every path.
        /// </summary>
        private static string ReadBody(HttpWebRequest request)
        {
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream body = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(body, Encoding.UTF8))
            {
                return reader.ReadToEnd().Trim();
            }
        }
    }
}
弄了一上午,到处问人、到处查资料,发现下面的代码可以用于17K网站:
// Fetches the page at `url` (defined by the surrounding code) and reads its body as text.
// In a real application, create the handler/client once and reuse them across requests.
var handler = new HttpClientHandler()
{
    AutomaticDecompression = System.Net.DecompressionMethods.GZip | System.Net.DecompressionMethods.Deflate,
    UseCookies = false,
};
var httpClient = new HttpClient(handler);

string content;
using (var requestMessage = new HttpRequestMessage(HttpMethod.Get, url))
{
    // Advertise only the encodings the handler above can actually decompress.
    // Offering "br" or "zstd" lets the server answer with a body that is never
    // decoded, which would then be read as garbage.
    requestMessage.Headers.Add("Accept-Encoding", "gzip, deflate");

    using (var message = await httpClient.SendAsync(requestMessage))
    {
        content = await message.Content.ReadAsStringAsync();
    }
}
// Author's note: this worked for the first few fetches and then stopped working,
// so the browser-automation approach shown next was used instead.
// NOTE(review): the earlier "gzip, deflate, br, zstd" header may have contributed
// to that — confirm against the site before relying on this snippet.
后来又找到一个更狠的办法:用 PuppeteerSharp,相当于用代码控制系统中的 Chrome 浏览器打开一个网页,然后再获取这个网页的源代码。
using System;
using System.Threading.Tasks;
using PuppeteerSharp; // add the PuppeteerSharp package via NuGet

namespace ConsoleApp2
{
    /// <summary>
    /// Console sample that opens a page in a headless browser through PuppeteerSharp
    /// and prints the page's HTML content.
    /// </summary>
    internal class Program
    {
        /// <summary>
        /// Downloads a browser build, loads the sample page, waits briefly and writes
        /// the resulting HTML to standard output.
        /// </summary>
        static async Task Main(string[] args)
        {
            // Downloads the browser build that PuppeteerSharp provides. Without this
            // line a local browser must be specified through ExecutablePath below.
            await new BrowserFetcher().DownloadAsync(BrowserTag.Stable);

            var browser = await Puppeteer.LaunchAsync(new LaunchOptions
            {
                // ExecutablePath = "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
                Headless = true
            });

            try
            {
                var page = await browser.NewPageAsync();
                await page.GoToAsync("https://www.17k.com/book/554720.html");

                // Fixed 2-second pause before the content is read.
                await page.WaitForTimeoutAsync(2000);

                string html = await page.GetContentAsync();
                Console.WriteLine(html);
            }
            finally
            {
                // Close the browser even when navigation or reading fails, so the
                // launched browser is not left running.
                await browser.CloseAsync();
            }
        }
    }
}
另外还有一个 Playwright 也能实现用代码操作浏览器打开网页的功能,它主要用于自动化测试。以前记录过这个名字,不过一直没有时间看……主要是“懒”。 https://playwright.dev/dotnet/docs/intro 标签:HTML,C#,request,System,源码,using,new,response,string From: https://www.cnblogs.com/niunan/p/18091019