Java解析xml大文件

admin 发布于：2023-02-18 10:30:36

阅读：loading

基本介绍

曾经，使用Java解析xml文件的实现方式和优缺点还是经典的面试题目，要是没有经历过或者并不认可，说明还是很年轻的。日常工作中常见的配置文件类型有：xml、yml、properties、ini、json等格式，一些常用的简单配置的首选可能不再是xml，但xml格式也终将不会被抛弃，毕竟它有许多适合的应用场景，以及其它格式无法具备的优点。

关于xml格式文件的解析，记忆中常用的有dom、jdom、dom4j、sax和JDK6中增加的Jaxb组件，对于xml大文件的解析除了它们自身实现的差异外，也主要取决于对各个API的把控。这几年来对于xml文件的解析一直使用的是Jaxb，话说它还真的是我觉得最便捷、最易用的，只不过在去年遇到了一个50M以上大小的xml文件。在文件稍大的情况下，首先联想到的便是肯定不能一次性读取文件，再去使用xml解析，也就是说上述的所有组件均不能直接使用，于是在同事超大佬的推荐下使用Guava的Files工具类Files.asCharSource(file, charset).readLines(callback)来将文件分段，取每个分段内的文件内容组合为一个批次的xml内容段去解析，最终循环所有内容段，避免一次性读取文件造成服务器的内存压力。所以，重点就在于使用Guava的Files工具类，使用缓冲流的形式逐行读取文件内容。类似的工具类Apache Commons IO里FileUtils也提供了按行读取文件的lineIterator（注意不是readLines）实现，底层使用到了IOUtils，最终的实现是使用缓冲流读取文件，将文件内容按行读取，与本文所推荐的Guava的Files（其中也有readLines函数，高版本已标记过期）极为类似，二者都是采用生产者消费者模式读取的文件，不过这里我还是推荐使用Guava Files的asCharSource函数，它的底层实现支持LineProcessor（行处理器），我们在应用时可以一边生产一边消费，也可以生产一批后批量消费，同时它读取一行后等待消费再读取下一行，读取到的数据使用队列缓存存储，前者使用String缓存行，两者对于读取到的数据处理上也存在区别，前者（Commons IO的lineIterator）使用行的内容作为返回值进行处理，后者（Guava的LineProcessor）使用函数式接口进行处理。

本次示例分为两个：1.使用代码生成150M的xml文件，该xml文件中的内容是有规则的，按某行的开始和某行的关键字来将内容分段写入；2.按特殊的行的开始标记和结束标记来分段读取文件，将读取到的内容打印至控制台（打印至控制台模拟为数据的最终处理）。至于示例1文件的生成则比较简单，给出一个文件结构的示例看看即可，参考如下图所示：

（示例文件格式，实际文件按照1行标题行和200万零2条数据演示）

参考代码

定义数据消费者

package cn.chendd.xml;

import java.util.List;

/**
 * Callback that consumes one batch of text rows gathered while a file is read line by line.
 *
 * @author chendd
 * @date 2023/2/18 8:51
 */
@FunctionalInterface
public interface RowBatchListProcessor {

    /**
     * Handles a single batch of rows.
     *
     * @param rows the text rows that make up the batch
     */
    void execute(List<String> rows);

}

按行批量读取

package cn.chendd.xml;

import com.google.common.io.LineProcessor;

import java.util.List;
import java.util.Objects;

/**
 * 行解析实现
 *
 * @author chendd
 * @date 2023/2/18 8:41
 */
public class RowLineProcessor implements LineProcessor<String> {

    /**
     * xml文件内容的行节点个数
     */
    private static final int BATCH_SIZE = 200;

    private List<String> rows;
    private String beginMarker;
    private String endMarker;
    private RowBatchListProcessor processor;

    /**
     * 构造函数
     * @param rows 文件行数据
     * @param beginMarker 开始行标记
     * @param endMarker 结束行标记
     * @param processor 逻辑处理类
     */
    public RowLineProcessor(List<String> rows , String beginMarker , String endMarker , RowBatchListProcessor processor) {
        this.rows = rows;
        this.beginMarker = beginMarker;
        this.endMarker = endMarker;
        this.processor = processor;
    }

    /**
     * 单次获取的内容
     */
    private StringBuilder textBuilder = new StringBuilder();
    /**
     * 是否开始读取文件
     */
    private boolean begin = false;

    @Override
    public boolean processLine(String line) {
        if (line.endsWith(beginMarker)) {
            begin = true;
        }
        if (line.endsWith(endMarker)) {
            begin = false;
            textBuilder.append(line);
            rows.add(textBuilder.toString());
            textBuilder.setLength(0);
        } else if (begin) {
            textBuilder.append(line);
        }
        if (rows.size() > 0 && rows.size() % BATCH_SIZE == 0) {
            processor.execute(rows);
            rows.clear();
        }
        return true;
    }

    @Override
    public String getResult() {
        if (rows.isEmpty()) {
            return null;
        }
        this.processor.execute(rows);
        return null;
    }

}

调用示例

package cn.chendd.xml;

import com.google.common.collect.Lists;
import com.google.common.io.Files;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;

import java.io.File;
import java.io.IOException;
import java.net.URLDecoder;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * 文件按行读取验证
 *
 * @author chendd
 * @date 2023/2/18 9:59
 */
@RunWith(JUnit4.class)
public class RowLineReaderTest {

    @Test
    public void reader() throws IOException {
        AtomicInteger atomicInteger = new AtomicInteger();
        //批量数据解析实现
        RowBatchListProcessor execute = rows -> {
            System.out.println(String.format("第 %d 批数据处理，数据 %d 行！" , atomicInteger.addAndGet(1) , rows.size()));
        };
        RowLineProcessor processor = new RowLineProcessor(Lists.newArrayList() , "<tr>" , "</tr>" , execute);
        Files.asCharSource(this.getFile(), Charset.defaultCharset()).readLines(processor);
    }

    private File getFile() throws IOException {
        String fileFolder = URLDecoder.decode(getClass().getResource("").getFile() , StandardCharsets.UTF_8.name());
        return new File(fileFolder , "data.xml");
    }


}

示例输出