C# Regex 获取<table></table>的html

发布时间: 2023-08-18 17:59:20 作者: hofmann
        /// <summary>
        /// Verifies two regex extractions against a sample HTML fragment:
        /// the whole <c>&lt;table&gt;...&lt;/table&gt;</c> element, and the inner text of
        /// every <c>&lt;td&gt;</c> cell (empty cells included) joined with commas.
        /// </summary>
        [Fact]
        public void Regex_Test()
        {
            // Arrange
            // Note: inside a verbatim string, `\` is a literal backslash and `""` is one quote,
            // so the attributes in this sample read class=\"container\" etc. The sample is kept as-is.
            var rowHtml = @"<div class=\""container\"">
    <div class=\""title dottedline\"">XXXXXX股份有限公司-受益所有人查询结果</div>
</div>
<div class=\""container\"">
    <table class=\""gridtable\"">
        <tr>
            <th>序号</th>
            <th>名称</th>
            <th>注册币种</th>
            <th>成立日期</th>
            <th>注册资本</th>
            <th>统一社会信用代码</th>
            <th>持股占比</th>
            <th>投资类型描述</th>
            <th>是否上市</th>
            <th>是否是受益人</th>
            <th>受益人类型</th>
            <th>职务</th>
        <tr>
        <tr>
            <td>1</td>
            <td>XXXXXX股份有限公司</td>
            <td>人民币元</td>
            <td>1985-11-22</td>
            <td>35640625.708900</td>
            <td>91100000100003962T</td>
            <td></td>
            <td></td>
            <td>是</td>
            <td>否</td>
            <td></td>
            <td></td>
        </tr>
        <tr>
            <td>2</td>
            <td>铎梦之</td>
            <td></td>
            <td></td>
            <td></td>
            <td></td>
            <td></td>
            <td></td>
            <td></td>
            <td>是</td>
            <td>关键管理人员</td>
            <td>董事</td>
        </tr>
    </table>
</div>";

            // [\s\S]*? spans line breaks lazily, so the match stops at the first closing </table>.
            var tableRegex = new Regex(@"<table.*?>[\s\S]*?<\/table>");

            // Look-behind / look-ahead keep the <td> and </td> tags out of the matched value.
            var cellRegex = new Regex(@"(?<=<td>)(.*?)(?=</td>)", RegexOptions.IgnoreCase);

            // Act
            var tableMatches = tableRegex.Matches(rowHtml);
            var cellMatches = cellRegex.Matches(rowHtml);

            // Copy the values by index; joining them (instead of "+=" followed by TrimEnd(','))
            // keeps trailing empty cells, which TrimEnd would silently strip.
            var cells = new string[cellMatches.Count];
            for (var i = 0; i < cells.Length; i++)
            {
                cells[i] = cellMatches[i].Value;
            }

            var tdText = string.Join(",", cells);

            // Assert
            // Exactly one table, captured from its opening tag through its closing tag,
            // without the surrounding <div> wrappers.
            Assert.Single(tableMatches);
            var tableHtml = tableMatches[0].Value;
            Assert.StartsWith("<table class=", tableHtml);
            Assert.EndsWith("</table>", tableHtml);
            Assert.Contains("<td>董事</td>", tableHtml);
            Assert.DoesNotContain("<div", tableHtml);

            // Two data rows of twelve cells each; empty cells appear as empty entries.
            Assert.Equal(24, cells.Length);
            Assert.Equal(
                "1,XXXXXX股份有限公司,人民币元,1985-11-22,35640625.708900,91100000100003962T,,,是,否,,,2,铎梦之,,,,,,,,是,关键管理人员,董事",
                tdText);
        }