.NET 抓取 HTML 文本中的链接集合

发布时间:2023-11-24 11:46:23 作者:Chanwah
/// <summary>
/// Collects attribute-style values from a block of HTML text.
/// For every occurrence of <paramref name="searchStr"/> in <paramref name="content"/>, the text between
/// the end of that occurrence and the next double quote is appended to <paramref name="list"/>,
/// unless it is already present or contains an inline base64 PNG marker.
/// </summary>
/// <param name="content">Text to scan. Nothing happens when it is null or empty.</param>
/// <param name="searchStr">Prefix that marks the start of a value, e.g. <c>src="</c>. Nothing happens when it is null or empty.</param>
/// <param name="list">Receives the extracted values in order of first appearance; existing entries are kept and not duplicated.</param>
public static void GetListHtmlString(string content, string searchStr, List<string> list)
{
    if (string.IsNullOrEmpty(content) || string.IsNullOrEmpty(searchStr))
    {
        return;
    }

    int thisIndex = 0;
    while (true)
    {
        // Ordinal search: a culture-sensitive match may have a different length than
        // searchStr.Length, which would put the value start at the wrong offset.
        int startIndex = content.IndexOf(searchStr, thisIndex, System.StringComparison.Ordinal);
        if (startIndex == -1)
        {
            break;
        }

        startIndex += searchStr.Length;

        int endIndex = content.IndexOf('"', startIndex);
        if (endIndex == -1)
        {
            break;
        }

        // Advance the cursor before any filtering. Previously the base64 branch hit
        // 'continue' without moving thisIndex, so the same match was found forever.
        thisIndex = endIndex;

        string str = content.Substring(startIndex, endIndex - startIndex);

        // Inline base64 PNG payloads are not links; skip them.
        if (str.Contains("data:image/png;base64"))
        {
            continue;
        }

        if (!list.Contains(str))
        {
            list.Add(str);
        }
    }
}

 


 

调用:

 // File.ReadAllText opens, reads and closes the file in one call; the previous
 // StreamReader was never disposed and kept the file handle open.
 // NOTE(review): html is not passed to the calls below - they scan db.CanceledRemark.
 // Confirm which input is intended.
 var html = File.ReadAllText("C:\\html.txt");
 var list = new List<string>();
 // Collect image/script sources first, then hyperlink targets, into the same de-duplicated list.
 GetListHtmlString(db.CanceledRemark, "src=\"", list);
 GetListHtmlString(db.CanceledRemark, "href=\"", list);
 return SuccessMsg(list);

返回结果: