java读取txt文件解决乱码问题

发布时间: 2023-07-30 18:28:47  作者: newbigapple

说明:由于txt文件可能带有BOM,且编码方式各不相同,导入数据时会产生乱码,以下代码完美解决乱码问题。
参考他人代码,结合自己的业务加工而成,费了大半天功夫,希望对大家有点用处。
废话不多说,直接上代码:

 /**
     * 从txt文件流读取数据
     *
     * @param txtStream
     * @return
     * @throws IOException
     */
    public static List<String> readFromTxt(InputStream txtStream) throws IOException {
        List<String> paragraphList = new ArrayList<>();
        LabelValuePair<InputStream, Charset> result = getStreamCharset(txtStream);
        Charset cs = result.getValue();
        BOMInputStream bomInputStream = new BOMInputStream(result.getLabel());
        boolean hasBom = bomInputStream.hasBOM();
        InputStreamReader sr = hasBom ?
                new InputStreamReader(bomInputStream, Charset.forName(bomInputStream.getBOMCharsetName())) :
                new InputStreamReader(bomInputStream, cs);
        BufferedReader br = new BufferedReader(sr);
        String line = null;
        Integer lineIndex = 0;
        while ((line = br.readLine()) != null) {
            if (!hasBom && lineIndex == 0) {
                lineIndex++;
                if (StringUtils.isNotEmpty(line)) {
                    byte[] bts = line.getBytes(cs);
                    if ((bts[0] == -1 && bts[1] == -2) || bts[0] == -2 && bts[1] == -1) {
                        byte[] newBts = new byte[bts.length - 2];
                        for (int i = 2; i < bts.length; i++) {
                            newBts[i - 2] = bts[i];
                        }
                        line = new String(newBts, cs);
                    }
                }
            }
            if (StringUtils.isNotEmpty(line) && StringUtils.isNotEmpty(line.trim())) {
                paragraphList.add(line);
                log.info("读取数据:{},长度:{},value:{}", line, line.trim().length(), line.getBytes(cs));
            }
        }
        br.close();
        sr.close();
        return paragraphList;
    }

 /**
     * 判断获取字节流 编码格式,主要用于txt文件内容读取
     * 再次读取流,使用返回结果中的流
     *
     * @param stream
     * @return
     */
    public static LabelValuePair<InputStream, Charset> getStreamCharset(InputStream stream) throws IOException {
        LabelValuePair<InputStream, byte[]> result = readSteam(stream, true);
        byte[] buffer = result.getValue();
        if (buffer.length < 2)
            return new LabelValuePair<>(result.getLabel(), CharsetKit.CHARSET_GBK);
        String encode = getFileCharSet(new BufferedInputStream(new ByteArrayInputStream(result.getValue())));// getBytesCharset(buffer);

        return new LabelValuePair<>(result.getLabel(), CharsetKit.charset(encode));
    }

  /**
     * 判断txt编码格式方法
     *
     * @param bis
     * @return
     */
    public static String getFileCharSet(BufferedInputStream bis) {
        String charset = "GBK";
        byte[] first3Bytes = new byte[3];
        try {
            boolean checked = false;
            bis.mark(0);
            int read = bis.read(first3Bytes, 0, 3);
            if (read == -1) {
                return charset; //文件编码为 ANSI
            } else if (first3Bytes[0] == (byte) 0xFF
                    && first3Bytes[1] == (byte) 0xFE) {
                charset = "UTF-16LE"; //文件编码为 Unicode
                checked = true;
            } else if (first3Bytes[0] == (byte) 0xFE
                    && first3Bytes[1] == (byte) 0xFF) {
                charset = "UTF-16BE"; //文件编码为 Unicode big endian
                checked = true;
            } else if (first3Bytes[0] == (byte) 0xEF
                    && first3Bytes[1] == (byte) 0xBB
                    && first3Bytes[2] == (byte) 0xBF) {
                charset = "UTF-8"; //文件编码为 UTF-8
                checked = true;
            }
            bis.reset();
            if (!checked) {
                int loc = 0;
                while ((read = bis.read()) != -1) {
                    loc++;
                    if (read >= 0xF0)
                        break;
                    if (0x80 <= read && read <= 0xBF) // 单独出现BF以下的,也算是GBK
                        break;
                    if (0xC0 <= read && read <= 0xDF) {
                        read = bis.read();
                        if (0x80 <= read && read <= 0xBF) // 双字节 (0xC0 - 0xDF)
                            // (0x80
                            // - 0xBF),也可能在GB编码内
                            continue;
                        else
                            break;
                    } else if (0xE0 <= read && read <= 0xEF) {// 也有可能出错,但是几率较小
                        read = bis.read();
                        if (0x80 <= read && read <= 0xBF) {
                            read = bis.read();
                            if (0x80 <= read && read <= 0xBF) {
                                charset = "UTF-8";
                                break;
                            } else
                                break;
                        } else
                            break;
                    }
                }
            }
            bis.close();
        } catch (Exception e) {
            log.error("获取文件编码方式异常", e);
        }
        return charset;
    }

    /**
     * 读取流
     *
     * @param inputStream 输入流
     * @param isRepeat    是否重复读取
     * @return
     */
    public static LabelValuePair<InputStream, byte[]> readSteam(InputStream inputStream, boolean isRepeat) throws IOException {
        ByteArrayOutputStream outSteam = new ByteArrayOutputStream();
        byte[] buffer = new byte[1024];
        int len = -1;
        inputStream.mark(0);
        while ((len = inputStream.read(buffer)) != -1) {
            outSteam.write(buffer);
        }
        byte[] fs = outSteam.toByteArray();
        outSteam.close();
        inputStream.close();
        InputStream newSteam = null;
        if (isRepeat) {
            newSteam = new ByteArrayInputStream(fs);
        }

        return new LabelValuePair<>(newSteam, fs);
    }