01_微信公众号_话题_提取链接_一行一个

发布时间: 2023-04-04 19:10:03 作者: 虎虎生威啊

nodejs cheerio 提取超链接

import { log } from "console";
import {
  readFileSync,
  readdirSync,
  lstatSync,
  createWriteStream,
  mkdirSync,
  statSync,
} from "fs";
import { basename, extname, join, resolve } from "path";
import { load, CheerioAPI } from "cheerio";

// Working directories: the parent of this script's directory is the base,
// HTML inputs are read from <base>/html and outPath points at <base>/html/out.
const basePath = resolve(__dirname, "..");
const htmlPath = join(basePath, "html");
const outPath = join(htmlPath, "out");

// Ensure the output folder exists; a failure here is logged, not rethrown.
try {
  exitsFolder(outPath);
} catch (e) {
  log(e);
}

// Read the HTML folder and keep only the entries that are files
// (sub-folders such as the output folder are filtered out).
const fileList = readdirSync(htmlPath);
const pureFileList = fileList.filter((file) =>
  lstatSync(join(htmlPath, file)).isFile()
);

// Process every remaining file, one at a time.
for (const file of pureFileList) {
  extractTopic(join(htmlPath, file));
}

/**
 * Extracts the links from one saved HTML page and writes them to a text
 * file, one link per line.
 *
 * @param filePath - path of the HTML file to process
 */
function extractTopic(filePath: string) {
  const links = extractLink(loadHtmlDom(filePath));
  const target = getOutFilePath(filePath);
  const out = createWriteStream(target, "utf-8");

  // Each link is followed by a newline so the file holds one link per line.
  for (const link of links) {
    out.write(link);
    out.write("\n");
  }

  out.end();
}

/**
 * Reads an HTML file as UTF-8 text and parses it with cheerio.
 *
 * @param filePath - path of the HTML file to read
 * @returns the cheerio API bound to the parsed document
 */
function loadHtmlDom(filePath: string): CheerioAPI {
  return load(readFileSync(filePath, "utf-8"));
}

/**
 * Collects the `data-link` attribute of every `li` matched by the album
 * list selector, in document order.
 *
 * @param $ - cheerio API bound to the parsed page
 * @returns one entry per matched `li`; an item without a `data-link` value
 *          contributes an empty string
 */
function extractLink($: CheerioAPI) {
  const items = $(
    "#js_content_overlay > div.album.js_album_container.album-rich_media_area_primary_full > div > div.album__content.js_album_bd > ul li"
  ).toArray();

  // A missing (or empty) attribute is recorded as "" so positions are kept.
  const linkArr: string[] = items.map((item) => $(item).attr("data-link") || "");

  return linkArr;
}

/**
 * Ensures that a directory exists at `absPath`, creating it together with
 * any missing parent directories.
 *
 * With `{ recursive: true }`, `mkdirSync` does nothing when the directory is
 * already present, so no separate existence probe is needed. Unlike a
 * stat-then-create check, this also fails loudly when the path is occupied
 * by something that is not a directory, instead of silently accepting it.
 *
 * @param absPath - path of the directory that must exist
 * @throws if the directory cannot be created (e.g. a regular file already
 *         occupies the path, or permissions are insufficient)
 */
function exitsFolder(absPath: string) {
  mkdirSync(absPath, { recursive: true });
}

/**
 * Formats a moment in local time as `YYYY年MM月DD日HH时mm分`,
 * e.g. `2023年04月04日09时05分`.
 *
 * Month, day, hour and minute are all zero-padded to two digits so the
 * resulting strings have a fixed width and sort chronologically.
 *
 * @param date - the moment to format; defaults to the current time
 * @returns the formatted timestamp string
 */
function getCurDate(date: Date = new Date()) {
  // Left-pad a number to two digits.
  const pad2 = (value: number) => ("0" + value).slice(-2);

  const year = date.getFullYear();
  const month = pad2(date.getMonth() + 1); // getMonth() is zero-based
  const day = pad2(date.getDate());
  const hour = pad2(date.getHours());
  const minute = pad2(date.getMinutes());

  return year + "年" + month + "月" + day + "日" + hour + "时" + minute + "分";
}

/**
 * Builds the path of the output text file for a given input file:
 * `<outPath>/<timestamp>_目录_<input name without extension>.txt`.
 *
 * Only the final extension is removed from the input name, so names that
 * contain additional dots (e.g. `topic.v1.html` -> `topic.v1`) stay
 * distinct instead of being cut at the first dot.
 *
 * @param filePath - path of the input file
 * @returns path of the output file inside `outPath`
 */
function getOutFilePath(filePath: string) {
  const stem = basename(filePath, extname(filePath));
  return join(outPath, getCurDate() + "_目录_" + stem + ".txt");
}