用户需求将数据每天自动导出到一个Excel,一天一个Sheet,超过30天之后则每导出一天的就删掉前面一天的。
简单地实现一下:通过XSSFWorkbook读Excel,写入当天的数据,根据Sheet数量决定是否删掉前面的Sheet。
需求的数据每天大概20w条左右,列较多。大概到第10个sheet之后,-xxs设置成16G都扛不住,发现在读入Excel时就已经OOM了。
知道POI确实比较占内存,但以前没概念,没想到一个上百M的Excel就会占用了十几G的内存。
解决写大Excel的问题
解决写的问题比较容易,使用SXSSFWorkbook代替XSSFWorkbook即可。SXSSFWorkbook写入时只保留一部分Rows在内存中,其他的都写入到硬盘中。
但是SXSSFWorkbook是不能直接读Excel的,要读取时还是要通过XSSFWorkbook。也就是说一旦打开还是要爆内存。
那么解决方案就有两种:
- 每次捞30天的数据,分页通过SXSSFWorkbook写入新Excel覆盖掉之前文件。
- 还是要想办法将之前那个Excel读出来,和当天数据汇总后通过SXSSFWorkbook写入新Excel覆盖掉之前文件。
虽然1看起来比较容易实现,但是查一个月的数据的话数据库那块的耗时就大了。
所以还是不能放弃2。
解决读大Excel的问题
读了一些POI的文档,使用SAX读取大Excel并将Excel的内容以文本方式搞到内存。
直接上代码:
package com.csot.util;import java.io.File;import java.io.IOException;import java.io.InputStream;import java.util.ArrayList;import java.util.HashMap;import java.util.List;import javax.xml.parsers.ParserConfigurationException;import org.apache.poi.openxml4j.exceptions.OpenXML4JException;import org.apache.poi.openxml4j.opc.OPCPackage;import org.apache.poi.openxml4j.opc.PackageAccess;import org.apache.poi.ss.usermodel.DataFormatter;import org.apache.poi.ss.util.CellAddress;import org.apache.poi.ss.util.CellReference;import org.apache.poi.util.SAXHelper;import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable;import org.apache.poi.xssf.eventusermodel.XSSFReader;import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler;import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler;import org.apache.poi.xssf.model.StylesTable;import org.apache.poi.xssf.usermodel.XSSFComment;import org.xml.sax.ContentHandler;import org.xml.sax.InputSource;import org.xml.sax.SAXException;import org.xml.sax.XMLReader;public class XlsxReader {private OPCPackage xlsxPackage;private StringBuilder stringBuilder;private List<String> sheetNames;private HashMap<String, StringBuilder> sheets = new HashMap<String, StringBuilder>(15);public void tryOpen(String path) throws Exception{File xlsxFile = new File(path);if (!xlsxFile.exists()) {System.err.println("Not found or not a file: " + xlsxFile.getPath());throw new Exception("File not found");}OPCPackage pack = OPCPackage.open(xlsxFile.getPath(), PackageAccess.READ);pack.close();}public void read(String path) throws Exception{this.sheetNames = new ArrayList<String>();this.sheets.clear();File xlsxFile = new File(path);if (!xlsxFile.exists()) {System.err.println("Not found or not a file: " + xlsxFile.getPath());throw new Exception("File not found");}this.xlsxPackage = OPCPackage.open(xlsxFile.getPath(), PackageAccess.READ);this.process();this.xlsxPackage.close();}public List<String> getSheetNames(){return this.sheetNames;}public HashMap<String, StringBuilder> getSheetContents(){return this.sheets;}public StringBuilder getSheetContent(String name){return this.sheets.get(name);}/*** Initiates the processing of the XLS workbook file to CSV.** @throws IOException* @throws OpenXML4JException* @throws ParserConfigurationException* @throws SAXException*/private void process()throws IOException, OpenXML4JException, ParserConfigurationException, SAXException {ReadOnlySharedStringsTable strings = new ReadOnlySharedStringsTable(this.xlsxPackage);XSSFReader xssfReader = new XSSFReader(this.xlsxPackage);StylesTable styles = xssfReader.getStylesTable();XSSFReader.SheetIterator iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData();while (iter.hasNext()) {InputStream stream = iter.next();String sheetName = iter.getSheetName();this.stringBuilder = new StringBuilder();this.sheetNames.add(sheetName);processSheet(styles, strings, new SheetToCSV(), stream);this.sheets.put(sheetName, this.stringBuilder);stream.close();}}/*** Parses and shows the content of one sheet* using the specified styles and shared-strings tables.** @param styles* @param strings* @param sheetInputStream*/private void processSheet(StylesTable styles,ReadOnlySharedStringsTable strings,SheetContentsHandler sheetHandler,InputStream sheetInputStream)throws IOException, ParserConfigurationException, SAXException {DataFormatter formatter = new DataFormatter();InputSource sheetSource = new InputSource(sheetInputStream);try {XMLReader sheetParser = SAXHelper.newXMLReader();ContentHandler handler = new XSSFSheetXMLHandler(styles, null, strings, sheetHandler, formatter, false);sheetParser.setContentHandler(handler);sheetParser.parse(sheetSource);} catch (ParserConfigurationException e) {throw new RuntimeException("SAX parser appears to be broken - " + e.getMessage());}}private class SheetToCSV implements SheetContentsHandler {private boolean firstCellOfRow = false;private int currentRow = -1;private int currentCol = -1;private void outputMissingRows(int number) {for (int i = 0; i < number; i++) {stringBuilder.append('\n');}}@Overridepublic void startRow(int rowNum) {// If there were gaps, output the missing rowsoutputMissingRows(rowNum - currentRow - 1);// Prepare for this rowfirstCellOfRow = true;currentRow = rowNum;currentCol = -1;}@Overridepublic void endRow(int rowNum) {stringBuilder.append('\n');}@Overridepublic void cell(String cellReference, String formattedValue,XSSFComment comment) {if (firstCellOfRow) {firstCellOfRow = false;} else {stringBuilder.append(',');}// gracefully handle missing CellRef here in a similar way as XSSFCell doesif (cellReference == null) {cellReference = new CellAddress(currentRow, currentCol).formatAsString();}// Did we miss any cells?int thisCol = (new CellReference(cellReference)).getCol();int missedCols = thisCol - currentCol - 1;for (int i = 0; i < missedCols; i++) {stringBuilder.append(',');}currentCol = thisCol;// Number or string?try {Double.parseDouble(formattedValue);stringBuilder.append(formattedValue);} catch (NumberFormatException e) {//stringBuilder.append('"');stringBuilder.append(formattedValue);//stringBuilder.append('"');}}@Overridepublic void headerFooter(String text, boolean isHeader, String tagName) {}}}
通过这个Reader将Excel读到一个Map
其他
其实写的时候突然想到,可以用一个临时表装这些数据,每天通过ETL把数据放到临时表,然后每次导出30天的数据。
但是这不重要了(逃)。
