cheerio_妙高峰上_八曲仙人_诗歌和讲记

发布时间: 2023-04-16 21:49:34 作者: 虎虎生威啊

cheerio 关键字过滤 关键字替换 内容剔除 八曲仙人之歌讲解

import { log } from "console";
import {
  readFileSync,
  readdirSync,
  lstatSync,
  createWriteStream,
  mkdirSync,
  statSync,
} from "fs";
import { basename, extname, join, resolve } from "path";
import { load, CheerioAPI, Cheerio } from "cheerio";

// process.cwd() would be the folder the program is executed from.
// const basePath = process.cwd();

// Folder that is scanned for HTML files; results are written to its "out" subfolder.
const basePath = "E:\\公众号文章采集\\公众号HTML\\妙高峰上";
const outPath = join(basePath, "out");

// Make sure the output folder is there; a failure is only logged, not rethrown.
try {
  exitsFolder(outPath);
} catch (err) {
  log(err);
}

// Read the base folder; readdirSync returns the entry names as an array.
const fileList = readdirSync(basePath);

// Full paths of the entries that are regular files with a ".html" extension.
const pureFilePathList = fileList
  .map((entryName) => join(basePath, entryName))
  .filter(
    (entryPath) =>
      lstatSync(entryPath).isFile() && extname(entryPath) === ".html"
  );

// pureFilePathList.forEach((filePath) => {
//   extractTopic(filePath);
// });

// Clean each HTML file and write the result under outPath with the same file name.
for (const filePath of pureFilePathList) {
  log(filePath);
  const $ = loadHtmlDom(filePath);
  filterDom($);

  // .html() is typed string | null; guard so a null is never handed to the stream.
  const cleanedHtml = $("html").html();
  if (cleanedHtml === null) {
    log("no <html> element found, nothing written for: " + filePath);
  } else {
    const writeStream = createWriteStream(getOutFilePath(filePath), "utf-8");
    // end(chunk) writes the chunk and then closes the stream.
    writeStream.end(cleanedHtml);
  }

  // NOTE(review): this break stops the loop after the first file, so only one
  // file is ever processed. It looks like a debugging leftover; remove it to
  // process the whole folder.
  break;
}

// ====================================================================================================================
/**
 * Removes unwanted elements from the loaded document and rewrites one heading,
 * mutating the document behind `$` in place.
 *
 * Rule of thumb for removals: remove as few <p> tags as possible, because doing
 * so may delete real body content by mistake.
 *
 * Every step is independent: a selector that matches nothing is simply a no-op
 * and never prevents the later steps from running.
 *
 * @param $ Cheerio API bound to the document to clean.
 */
function filterDom($: CheerioAPI): void {
  // Topic tags.
  $("#js_tags").remove();
  // Span tags that contain this specific text.
  $("span:contains('↓↓↓ 请点击左下角“阅读原文”')").remove();

  // Disabled rules (left commented out):

  // style="white-space: normal;text-align: center;"
  // $(
  //   "p[style*='white-space: normal;text-align: center;']:contains('我是那')"
  // ).remove();

  // Centered paragraph containing '室利·尼萨迦达塔·马哈拉吉的开示录'
  // $(
  //   "p[style*='white-space: normal;text-align: center;']:contains('室利·尼萨迦达塔·马哈拉吉的开示录')"
  // ).remove();

  // Span tags whose text color is rgb(136, 136, 136) and that contain '室利·尼萨迦达塔·马哈拉吉 著'
  // $(
  //   "span[style*='color: rgb(136, 136, 136)']:contains('室利·尼萨迦达塔·马哈拉吉 著')"
  // ).remove();

  // Red span and strong tags
  // $("span[style*='color: rgb(255, 76, 65)']").remove();
  // $("strong[style*='color: rgb(255, 76, 65)']").remove();

  // div tags with class "comment".
  $("div.comment").remove();

  // data-id="85560"
  $("section[data-id='85560']").remove();

  // Remove the marker section (data-id="89227") together with every sibling
  // that follows it. nextAll() includes the last sibling, which a manual
  // next()-walk that stops when next() is empty would leave behind.
  const tailMarker = $("section[data-id='89227']");
  tailMarker.nextAll().remove();
  tailMarker.remove();

  // Remove whatever comes after the article body, e.g. "view full text".
  $("#js_article").nextAll().remove();

  // Caption span: first remove all of its siblings, then the span itself.
  const captionSpan = $("span:contains('(题图:拉玛那')");
  captionSpan.siblings().remove();
  captionSpan.remove();

  // Caption in a p tag.
  $("p:contains('题图:')").remove();

  // Append "哈哈" to the text of each heading paragraph matched by this exact
  // inline style. Each match is handled on its own so that several headings do
  // not all receive the concatenation of every heading's text.
  $(
    "p[style='margin-bottom: -1px; padding-right: 5px; padding-bottom: 6px; padding-left: 5px; border-bottom-width: 2px; border-bottom-style: solid; border-bottom-color: rgb(172, 29, 16); display: inline-block; line-height: 1.1; font-size: 18px;']"
  ).each((_, headingEl) => {
    const heading = $(headingEl);
    heading.text(heading.text() + "哈哈");
  });
}

/**
 * Reads the file at `filePath` as UTF-8 text and parses it with cheerio.
 *
 * @param filePath Path of the HTML file to read.
 * @returns The cheerio API returned by `load` for the file's content.
 */
function loadHtmlDom(filePath: string): CheerioAPI {
  return load(readFileSync(filePath, "utf-8"));
}

/**
 * Collects the `data-jump_url` attribute of every `div` that is a direct child
 * of `#js_articles`.
 *
 * @param $ Cheerio API bound to the document to search.
 * @returns The non-empty attribute values in document order; an empty array
 *          when nothing matches.
 */
function extractLink($: CheerioAPI) {
  const jumpUrls: string[] = [];

  for (const linkEl of $("#js_articles > div").toArray()) {
    const jumpUrl = $(linkEl).attr("data-jump_url");
    // Skip elements where the attribute is missing or empty.
    if (jumpUrl) {
      jumpUrls.push(jumpUrl);
    }
  }

  return jumpUrls;
}

/**
 * Ensures that the folder `absPath` exists, creating it (and any missing
 * parent folders) when necessary.
 *
 * With `recursive: true`, mkdirSync does nothing when the directory already
 * exists, so no separate existence check is needed. Unlike a stat-then-create
 * approach, unrelated failures are not swallowed.
 *
 * @param absPath Path of the folder that must exist.
 * @throws The error raised by mkdirSync, e.g. when `absPath` already exists
 *         but is not a directory.
 */
function exitsFolder(absPath: string): void {
  mkdirSync(absPath, { recursive: true });
}

/**
 * Formats the current local date and time as `YYYY年MM月DD日HH时MM分SS秒`.
 *
 * Every field except the year is zero-padded to two digits, so the result has
 * a fixed width.
 *
 * @returns The formatted timestamp string.
 */
function getCurDate(): string {
  // Left-pad a number below 100 with a zero to two digits.
  const pad2 = (value: number): string => ("0" + value).slice(-2);

  const now = new Date();

  return (
    now.getFullYear() +
    "年" +
    pad2(now.getMonth() + 1) + // getMonth() is zero-based
    "月" +
    pad2(now.getDate()) +
    "日" +
    pad2(now.getHours()) +
    "时" +
    pad2(now.getMinutes()) +
    "分" +
    pad2(now.getSeconds()) +
    "秒"
  );
}

/**
 * Maps an input file path to the path of the same-named file inside `outPath`.
 *
 * @param filePath Path of the source file.
 * @returns `outPath` joined with the base name of `filePath`.
 */
function getOutFilePath(filePath: string) {
  const fileName = basename(filePath);
  return join(outPath, fileName);
}