.Net多线程读取pdf文本

发布时间 2023-09-14 17:40:38作者: 十三年新*

1.nuget安装UglyToad.PdfPig

2.使用 SemaphoreSlim semaphore = new SemaphoreSlim(10); 限制并发数:每一页对应一个任务,最多同时有10个任务在读取各自页面的文本。

C#代码:

/// <summary>
/// Extracts the text of every page of a PDF file, reading pages concurrently
/// (at most 10 at a time), and returns the page texts concatenated in page order.
/// </summary>
/// <param name="filePath">Path of the file to read. Passed to the <see cref="FileInfo"/> constructor as-is.</param>
/// <returns>
/// The concatenated page text, or <see cref="string.Empty"/> when the file does not
/// have a <c>.pdf</c> extension or when reading fails for any reason.
/// </returns>
/// <remarks>
/// This is a best-effort helper: failures while opening or parsing the document are
/// reported through <see cref="System.Diagnostics.Trace"/> and swallowed. The calling
/// thread is blocked until all pages have been processed.
/// </remarks>
static string GetPdfText(string filePath)
{
    FileInfo file = new FileInfo(filePath);

    // Compare the extension exactly and culture-independently; a lower-cased
    // substring match would also accept extensions such as ".pdfx" or ".xpdf".
    if (!string.Equals(file.Extension, ".pdf", StringComparison.OrdinalIgnoreCase))
    {
        return string.Empty;
    }

    try
    {
        // Open the document once up front only to learn how many pages there are.
        int pageCount;
        using (UglyToad.PdfPig.PdfDocument document = UglyToad.PdfPig.PdfDocument.Open(file.FullName))
        {
            pageCount = document.NumberOfPages;
        }

        List<Task<Tuple<int, string>>> tasks = new List<Task<Tuple<int, string>>>(pageCount);

        // The semaphore caps the number of pages being read at the same time.
        // It is disposed only after Task.WaitAll returns or throws, i.e. once
        // every task has finished and can no longer touch it.
        using (SemaphoreSlim semaphore = new SemaphoreSlim(10))
        {
            for (int i = 0; i < pageCount; i++)
            {
                int pageIndex = i; // per-iteration copy so each lambda captures its own value
                tasks.Add(Task.Run(async () =>
                {
                    await semaphore.WaitAsync();
                    try
                    {
                        // PdfPageText opens its own document instance for this page.
                        return PdfPageText(file.FullName, pageIndex);
                    }
                    finally
                    {
                        semaphore.Release();
                    }
                }));
            }

            Task.WaitAll(tasks.ToArray());
        }

        // Tasks were added in page order, so iterating the list already yields
        // the pages in order; no sorting by page number is required.
        var sb = new StringBuilder();
        foreach (Task<Tuple<int, string>> task in tasks)
        {
            sb.Append(task.Result.Item2);
        }
        return sb.ToString();
    }
    catch (Exception ex)
    {
        // Keep the best-effort contract (empty string on failure) but leave a
        // diagnostic trail instead of discarding the error silently.
        System.Diagnostics.Trace.TraceWarning("GetPdfText failed for '" + file.FullName + "': " + ex);
        return string.Empty;
    }
}

/// <summary>
/// Opens the PDF at <paramref name="fullName"/> and reads the text of a single page.
/// </summary>
/// <param name="fullName">Path of the PDF file to open.</param>
/// <param name="i">Zero-based page index; page <c>i + 1</c> is requested from the document.</param>
/// <returns>A tuple of the one-based page number and that page's text.</returns>
static Tuple<int, string> PdfPageText(string fullName, int i)
{
    // The document API is addressed with one-based page numbers.
    int pageNumber = i + 1;

    using (var pdf = UglyToad.PdfPig.PdfDocument.Open(fullName))
    {
        string pageText = pdf.GetPage(pageNumber).Text;
        return Tuple.Create(pageNumber, pageText);
    }
}