java多线程爬取笔趣阁所有小说

发布时间 2023-09-04 10:05:48作者: 苦逼vs猴子

可以选择下载的数量,全部下载下来够呛,首先没那么大的盘
新版本:https://wws.lanzous.com/iAEMoghsgeb 密码:7vjz
jar包:https://wws.lanzous.com/ilphyghsgcj 密码:f38a

		<dependency>
            <!-- jsoup HTML parser library @ https://jsoup.org/ -->
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.2</version>
		</dependency>

核心代码展示;

package com.aaa.data;
 
import com.aaa.config.SSLHelper;
import com.aaa.dto.BookCatalogueDto;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.Elements;
 
import java.io.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
 
/**
 * @author 三木猿
 * @version 1.0
 * @Title:
 * @date 2020/8/10 15:16
 */
public class DownloadBook {
    // Site key assigned by setDataSource ("biquge5200" or "biquge"); downloadBook reads it to
    // decide which chapter fetcher (biquge5200 or biquge) to call.
    private static String dataSource;
    // Matches an HTML anchor of the form <a href="...">text: group 1 captures the href value,
    // group 2 captures the link text up to (but not including) the closing </a>.
    private static Pattern pattern = Pattern.compile("<a\\s*href=\"?([\\w\\W]*?)\"?[\\s]*?[^>]>([\\s\\S]*?)(?=</a>)");
 
    /**
     * Selects the source site and downloads books {@code 0_1} through {@code 0_count} from it.
     *
     * <p>The index range is split into two halves, each crawled by its own thread, and this
     * method blocks until both threads have finished. A book whose target file already exists
     * is skipped. A book that fails (missing page, network error, ...) is logged and skipped;
     * it no longer aborts the rest of the range. An unrecognised {@code dataSource} is stored
     * but nothing is downloaded.
     *
     * @param dataSource site key, either {@code "biquge5200"} or {@code "biquge"}
     * @param count      highest book index to fetch; values below 1 download nothing
     */
    public static void setDataSource(String dataSource, int count) {
        SSLHelper.init();
        DownloadBook.dataSource = dataSource;

        final String baseUrl;
        if ("biquge5200".equals(dataSource)) {
            baseUrl = "https://www.biquge5200.com/";
        } else if ("biquge".equals(dataSource)) {
            baseUrl = "https://www.biquge.com/";
        } else {
            return;
        }
        if (count < 1) {
            return;
        }

        // First worker takes [1, middle], second takes [middle + 1, count]; together they
        // cover every index exactly once.
        final int middle = count / 2;
        Thread firstHalf = new Thread(() -> crawlBookRange(baseUrl, 1, middle));
        Thread secondHalf = new Thread(() -> crawlBookRange(baseUrl, middle + 1, count));
        firstHalf.start();
        secondHalf.start();
        try {
            firstHalf.join();
            secondHalf.join();
        } catch (InterruptedException e) {
            // Stop waiting and keep the interrupt visible to the caller; never start the
            // workers a second time.
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Downloads every book whose index lies in {@code [first, last]} from the given site.
     *
     * @param baseUrl site root ending with a slash
     * @param first   first book index (inclusive)
     * @param last    last book index (inclusive); nothing happens when {@code last < first}
     */
    private static void crawlBookRange(String baseUrl, int first, int last) {
        for (int i = first; i <= last; i++) {
            String bookCod = "0_" + i;
            try {
                Document document = Jsoup.connect(baseUrl + bookCod + "/").get();
                Element info = document.getElementById("info");
                if (info == null) {
                    System.err.println("Skipping " + bookCod + ": page has no #info element");
                    continue;
                }
                String bookName = info.select("h1").text();
                File file = new File("/usr/local/webapps/file/" + bookName + ".txt");
                if (file.exists()) {
                    continue;
                }
                System.out.println("---------------" + bookName + "正在下载" + "--------------");
                List<BookCatalogueDto> bookCatalogue = getBookCatalogue(bookCod, document, pattern);
                downloadBook(bookCod, bookName, bookCatalogue);
                System.out.println("---------------" + bookName + "下载完成" + "--------------");
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                return;
            } catch (Exception e) {
                // One broken book must not stop the remaining ones.
                System.err.println("Skipping " + bookCod + ": " + e);
            }
        }
    }
 
    /**
     * Downloads one book using three worker threads and merges the result into a single file.
     *
     * <p>Returns immediately when the final file already exists. Otherwise the catalogue is
     * split into three slices; worker k (1..3) fetches slice k into the part file named
     * {@code bookName + k}, using the fetcher that matches the current {@code dataSource}.
     * After all workers finish, the parts are combined and the elapsed milliseconds printed.
     *
     * @param bookCod          site code of the book, e.g. {@code "0_1"}
     * @param bookName         book title, used as the file name
     * @param bookCatalogueDto full chapter catalogue of the book
     * @throws Exception if waiting for a worker is interrupted or combining the parts fails
     */
    public static void downloadBook(String bookCod, String bookName, List<BookCatalogueDto> bookCatalogueDto) throws Exception {
        File target = new File("/usr/local/webapps/file/" + bookName + ".txt");
        if (target.exists()) {
            return;
        }

        final Map<Integer, List<BookCatalogueDto>> slices = splitList(bookCatalogueDto, 3);
        final long startedAt = System.currentTimeMillis();

        final Thread[] workers = new Thread[3];
        for (int slot = 0; slot < workers.length; slot++) {
            final int part = slot;
            workers[slot] = new Thread(() -> {
                try {
                    if ("biquge5200".equals(dataSource)) {
                        biquge5200(bookCod, bookName + (part + 1), slices.get(part));
                    } else if ("biquge".equals(dataSource)) {
                        biquge(bookCod, bookName + (part + 1), slices.get(part));
                    }
                } catch (Exception e) {
                    e.printStackTrace();
                }
            });
        }
        for (Thread worker : workers) {
            worker.start();
        }
        for (Thread worker : workers) {
            worker.join();
        }

        // Merge the part files into the final book file.
        combine(bookName);
        long finishedAt = System.currentTimeMillis();
        System.out.println("本次下载共用时" + (finishedAt - startedAt));
    }
 
    /**
     * Downloads the given chapters from biquge5200.com and appends them to the part file
     * {@code /usr/local/webapps/file/downloading/<bookName>.txt}.
     *
     * <p>If the part file already exists, the last chapter title found in it (via
     * {@code txtCatalogue}) is looked up in the catalogue and only the chapters after it are
     * fetched. Each page is tried twice, 5 seconds apart; a chapter that still fails, or whose
     * page has no {@code #content} element, is logged and skipped without writing its title.
     * The writer is always closed, including on early exit.
     *
     * @param bookCod          site code of the book, used to build chapter URLs
     * @param bookName         name of the part file (without extension)
     * @param bookCatalogueDto chapters to download, in order
     * @throws Exception if the part file cannot be opened or closed
     */
    public static void biquge5200(String bookCod, String bookName, List<BookCatalogueDto> bookCatalogueDto) throws
            Exception {
        String path = "/usr/local/webapps/file/downloading/" + bookName + ".txt";
        File file = new File(path);
        if (!file.exists()) {
            File dir = new File(file.getParent());
            dir.mkdirs();
            try {
                file.createNewFile();
            } catch (IOException e) {
                e.printStackTrace();
            }
        } else {
            // Resume: drop everything up to and including the last chapter already on disk.
            List<BookCatalogueDto> saved = txtCatalogue(bookName);
            if (!saved.isEmpty()) {
                String lastSavedName = saved.get(saved.size() - 1).getCatalogueName();
                for (int i = 0; i < bookCatalogueDto.size(); i++) {
                    if (bookCatalogueDto.get(i).getCatalogueName().equals(lastSavedName)) {
                        bookCatalogueDto = bookCatalogueDto.subList(i + 1, bookCatalogueDto.size());
                        break;
                    }
                }
            }
        }

        // Nothing left to fetch: return before opening a writer that would never be closed.
        if (bookCatalogueDto.isEmpty()) {
            return;
        }

        // Append the crawled chapters to the part file as plain text.
        try (BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file, true)))) {
            for (BookCatalogueDto chapter : bookCatalogueDto) {
                String url = "https://www.biquge5200.com/" + bookCod + "/" + chapter.getCatalogueCod() + ".html";
                Document document;
                try {
                    document = fetchBiquge5200Page(url);
                } catch (InterruptedException interrupted) {
                    // Stop this part; try-with-resources still closes the writer.
                    Thread.currentThread().interrupt();
                    return;
                }
                if (document == null) {
                    System.err.println("Skipping chapter, download failed twice: " + url);
                    continue;
                }
                Elements elements = document.select("#content");
                if (elements.isEmpty()) {
                    System.err.println("Skipping chapter, no #content element: " + url);
                    continue;
                }
                String html = elements.get(0).html().replace("<div id='content'>", "").replace("</div>", "");
                String replace = html.replace("<script>readx();</script>", "").replace("<script>chaptererror();</script>", "");
                try {
                    bw.write(document.select("h1").text());
                    bw.newLine();
                    // One output line per <p>...</p> paragraph.
                    for (String s : replace.replace("<p>", "").split("</p>")) {
                        bw.write(s);
                        bw.newLine();
                    }
                    bw.flush();
                } catch (IOException ioException) {
                    ioException.printStackTrace();
                }
            }
        }
    }

    /**
     * Fetches a biquge5200 page, retrying once after 5 seconds on an I/O failure.
     *
     * @param url page address
     * @return the parsed page, or {@code null} when both attempts failed
     * @throws InterruptedException if the thread is interrupted while waiting to retry
     */
    private static Document fetchBiquge5200Page(String url) throws InterruptedException {
        try {
            return Jsoup.connect(url).get();
        } catch (IOException firstFailure) {
            Thread.sleep(5000);
            try {
                return Jsoup.connect(url).get();
            } catch (IOException secondFailure) {
                return null;
            }
        }
    }
 
    /**
     * Builds the chapter catalogue of a book from its index page.
     *
     * <p>All {@code <dd>} elements of the page are split into three slices; each slice is
     * converted by {@code get} on its own thread, and the three partial lists are concatenated
     * in slice order once every thread has finished.
     *
     * @param bookCod  site code of the book
     * @param document parsed index page of the book
     * @param pattern  anchor pattern handed through to {@code get}
     * @return catalogue entries in page order
     * @throws InterruptedException if the calling thread is interrupted while joining a worker
     */
    public static List<BookCatalogueDto> getBookCatalogue(String bookCod, Document document, Pattern pattern) throws InterruptedException {
        final Map<Integer, List<Element>> slices = splitList(document.getElementsByTag("dd"), 3);

        @SuppressWarnings("unchecked")
        final List<BookCatalogueDto>[] partials = new List[3];
        final Thread[] workers = new Thread[3];
        for (int slot = 0; slot < workers.length; slot++) {
            final int part = slot;
            // Starts out empty, so a worker that dies still contributes a (blank) list.
            partials[part] = new ArrayList<>();
            workers[part] = new Thread(() -> {
                partials[part] = get(slices.get(part), bookCod, document, pattern);
            });
        }
        for (Thread worker : workers) {
            worker.start();
        }
        for (Thread worker : workers) {
            worker.join();
        }

        List<BookCatalogueDto> merged = new ArrayList<>();
        for (List<BookCatalogueDto> partial : partials) {
            merged.addAll(partial);
        }
        return merged;
    }
 
    /**
     * Converts catalogue {@code <dd>} elements into catalogue entries.
     *
     * <p>For each element the last child node with non-empty markup is matched against
     * {@code pattern}; the chapter code is the integer between the last {@code '/'} and the
     * last {@code '.'} of the captured href. Elements that have no children, no matching
     * link, or no numeric chapter code are skipped, because an entry without a chapter code
     * cannot be turned into a chapter URL.
     *
     * @param dd       catalogue elements to convert
     * @param bookCod  site code of the book, stored on every entry
     * @param document index page the elements came from (not used here)
     * @param pattern  anchor pattern whose group 1 is the href
     * @return one entry per element that carries a usable chapter link, in input order
     */
    public static List<BookCatalogueDto> get(List<Element> dd, String bookCod, Document document, Pattern pattern) {
        List<BookCatalogueDto> bookCatalogueDtos = new ArrayList<>();
        for (Element element : dd) {
            // Pick the last child that renders to non-empty markup.
            Node node = null;
            for (Node child : element.childNodes()) {
                if (!"".equals(child.toString())) {
                    node = child;
                }
            }
            if (node == null) {
                continue;
            }

            Matcher matcher = pattern.matcher(node.toString());
            if (!matcher.find()) {
                continue;
            }
            String nameCodeUrl = matcher.group(1);
            int slash = nameCodeUrl.lastIndexOf("/");
            int dot = nameCodeUrl.lastIndexOf(".");
            if (dot <= slash + 1) {
                // No "<code>." segment after the last slash.
                continue;
            }
            int catalogueCod;
            try {
                catalogueCod = Integer.parseInt(nameCodeUrl.substring(slash + 1, dot));
            } catch (NumberFormatException notAChapterLink) {
                continue;
            }

            BookCatalogueDto bookCatalogueDto = new BookCatalogueDto();
            bookCatalogueDto.setCatalogueCod(catalogueCod);
            bookCatalogueDto.setBookCod(bookCod);
            bookCatalogueDto.setCatalogueName(element.text());
            bookCatalogueDtos.add(bookCatalogueDto);
        }
        return bookCatalogueDtos;
    }
 
    /**
     * Downloads the given chapters from biquge.com and appends them to the part file
     * {@code /usr/local/webapps/file/downloading/<bookName>.txt}.
     *
     * <p>If the part file already exists, the last chapter title found in it (via
     * {@code txtCatalogue}) is looked up in the catalogue and only the chapters after it are
     * fetched. Each page is tried twice, 5 seconds apart; a chapter that still fails, or whose
     * page has no {@code #content} element, is logged and skipped without writing its title.
     * The writer is always closed, including on early exit.
     *
     * @param bookCod          site code of the book (chapter URLs use each entry's own book code)
     * @param bookName         name of the part file (without extension)
     * @param bookCatalogueDto chapters to download, in order
     * @throws FileNotFoundException if the part file cannot be opened for appending
     */
    private static void biquge(String bookCod, String bookName, List<BookCatalogueDto> bookCatalogueDto) throws FileNotFoundException {
        String path = "/usr/local/webapps/file/downloading/" + bookName + ".txt";
        File file = new File(path);
        if (!file.exists()) {
            File dir = new File(file.getParent());
            dir.mkdirs();
            try {
                file.createNewFile();
            } catch (IOException e) {
                e.printStackTrace();
            }
        } else {
            // Resume: drop everything up to and including the last chapter already on disk.
            List<BookCatalogueDto> saved = txtCatalogue(bookName);
            if (!saved.isEmpty()) {
                String lastSavedName = saved.get(saved.size() - 1).getCatalogueName();
                for (int i = 0; i < bookCatalogueDto.size(); i++) {
                    if (bookCatalogueDto.get(i).getCatalogueName().equals(lastSavedName)) {
                        bookCatalogueDto = bookCatalogueDto.subList(i + 1, bookCatalogueDto.size());
                        break;
                    }
                }
            }
        }

        // Nothing left to fetch: return before opening a writer that would never be closed.
        if (bookCatalogueDto.isEmpty()) {
            return;
        }

        // Append the crawled chapters to the part file as plain text.
        try (BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file, true)))) {
            for (BookCatalogueDto chapter : bookCatalogueDto) {
                String url = "https://www.biquge.com/" + chapter.getBookCod() + "/" + chapter.getCatalogueCod() + ".html";
                Document document;
                try {
                    document = fetchBiqugePage(url);
                } catch (InterruptedException interrupted) {
                    // Stop this part; try-with-resources still closes the writer.
                    Thread.currentThread().interrupt();
                    return;
                }
                if (document == null) {
                    System.err.println("Skipping chapter, download failed twice: " + url);
                    continue;
                }
                Elements elements = document.select("#content");
                if (elements.isEmpty()) {
                    System.err.println("Skipping chapter, no #content element: " + url);
                    continue;
                }
                String html = elements.get(0).html().replace("<div id='content'>", "").replace("</div>", "");
                String replace = html.replace("<script>readx();</script>", "").replace("<script>chaptererror();</script>", "");
                try {
                    bw.write(document.select("h1").text());
                    bw.newLine();
                    // One output line per <br>-separated fragment.
                    for (String s : replace.split("<br>")) {
                        bw.write(s);
                        bw.newLine();
                    }
                    bw.flush();
                } catch (IOException ioException) {
                    ioException.printStackTrace();
                }
            }
        } catch (FileNotFoundException e) {
            // Opening the part file failed: this is the one error the signature reports.
            throw e;
        } catch (IOException e) {
            // Only a failure while closing the writer can reach this point.
            e.printStackTrace();
        }
    }

    /**
     * Fetches a biquge page, retrying once after 5 seconds when the first attempt fails.
     *
     * @param url page address
     * @return the parsed page, or {@code null} when both attempts failed
     * @throws InterruptedException if the thread is interrupted while waiting to retry
     */
    private static Document fetchBiqugePage(String url) throws InterruptedException {
        try {
            return Jsoup.connect(url).get();
        } catch (Exception firstFailure) {
            Thread.sleep(5000);
            try {
                return Jsoup.connect(url).get();
            } catch (Exception secondFailure) {
                secondFailure.printStackTrace();
                return null;
            }
        }
    }
 
    /**
     * Cuts a list into {@code num} consecutive views keyed {@code 0 .. num-1}.
     *
     * <p>Every chunk except the last holds {@code t.size() / num} elements (integer division);
     * the last chunk additionally receives whatever remains. The values are
     * {@link List#subList} views of {@code t}, not copies.
     *
     * @param t   list to partition
     * @param num number of chunks
     * @param <T> element type
     * @return map from chunk index to chunk view
     */
    public static <T> Map<Integer, List<T>> splitList(List<T> t, int num) {
        Map<Integer, List<T>> chunks = new HashMap<>();
        final int chunkSize = t.size() / num;
        int from = 0;
        for (int index = 0; index < num; index++) {
            boolean lastChunk = (index == num - 1);
            int to = lastChunk ? t.size() : from + chunkSize;
            chunks.put(index, t.subList(from, to));
            from += chunkSize;
        }
        return chunks;
    }
 
    /**
     * Appends the three part files of a book, in order 1..3, to the final book file
     * {@code /usr/local/webapps/file/<bookName>.txt} and deletes each part afterwards.
     *
     * <p>Missing part files are ignored. The reader and writer are closed even when copying
     * fails, and the merged output is flushed before a part file is deleted so that a part is
     * never removed while its content is still only in the write buffer.
     *
     * @param bookName book title, used to derive the final and part file names
     * @throws Exception if the final file cannot be opened or a part cannot be copied
     */
    public static void combine(String bookName) throws Exception {
        String bookPath = "/usr/local/webapps/file/" + bookName + ".txt";
        File file = new File(bookPath);
        try (BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file, true)))) {
            for (int i = 1; i < 4; i++) {
                String path = "/usr/local/webapps/file/downloading/" + bookName + i + ".txt";
                File part = new File(path);
                if (part.exists()) {
                    try (BufferedReader br = new BufferedReader(new FileReader(part))) {
                        String line;
                        while ((line = br.readLine()) != null) {
                            bw.write(line);
                            bw.newLine();
                        }
                    }
                    bw.flush();
                }
                part.delete();
            }
            bw.flush();
        }
    }
 
    /**
     * Scans an already-downloaded part file for chapter title lines.
     *
     * <p>The file {@code /usr/local/webapps/file/downloading/<bookName>.txt} is read as UTF-8,
     * line by line. A line counts as a title when it matches the chapter pattern below
     * (optional leading whitespace, "第", 1-9 characters, one of 章节卷集部篇回, one whitespace
     * character, then the rest of the line). Read errors are reported on stdout and whatever
     * was collected so far is returned.
     *
     * <p>NOTE(review): only every other matching line is added to the result (the 1st, 3rd,
     * 5th, ... title). This mirrors the previous behaviour; confirm whether it is meant to
     * skip a repeated title line or is unintended.
     *
     * @param bookName name of the part file (without extension); also stored on each entry
     * @return entries carrying the book name and the trimmed title text, in file order
     */
    public static List<BookCatalogueDto> txtCatalogue(String bookName) {
        List<BookCatalogueDto> bookCatalogueDtos = new ArrayList<>();
        String fileNamedirs = "/usr/local/webapps/file/downloading/" + bookName + ".txt";
        // Compiled once per call instead of once per line.
        Pattern titlePattern = Pattern.compile("(^\\s*第)(.{1,9})[章节卷集部篇回](\\s{1})(.*)($\\s*)");
        try {
            File file = new File(fileNamedirs);
            if (file.isFile() && file.exists()) {
                try (BufferedReader bufferedReader =
                             new BufferedReader(new InputStreamReader(new FileInputStream(file), "utf-8"))) {
                    // Carries over from one title line to the next: when set, the next title
                    // found is dropped and the flag is cleared.
                    boolean skipNextTitle = false;
                    String lineTxt;
                    while ((lineTxt = bufferedReader.readLine()) != null) {
                        // The pattern is anchored at the start of the line and consumes the
                        // rest of it, so a line yields at most one match.
                        Matcher matcher = titlePattern.matcher(lineTxt);
                        if (!matcher.find()) {
                            continue;
                        }
                        if (skipNextTitle) {
                            skipNextTitle = false;
                            continue;
                        }
                        skipNextTitle = true;

                        BookCatalogueDto content = new BookCatalogueDto();
                        content.setBookName(bookName);
                        content.setCatalogueName(matcher.group().trim());
                        bookCatalogueDtos.add(content);
                    }
                }
            } else {
                System.out.println("找不到指定的文件");
            }
        } catch (Exception e) {
            System.out.println("读取文件内容出错");
            e.printStackTrace();
        }
        return bookCatalogueDtos;
    }
}

来源:https://blog.csdn.net/yangsen6666/article/details/127011878