用户需求将数据每天自动导出到一个Excel,一天一个Sheet,超过30天之后则每导出一天的就删掉前面一天的。
简单地实现一下:通过XSSFWorkbook读Excel,写入当天的数据,根据Sheet数量决定是否删掉前面的Sheet。
需求的数据每天大概20w条左右,列较多。大概到第10个sheet之后,-xxs设置成16G都扛不住,发现在读入Excel时就已经OOM了。
知道POI确实比较占内存,但以前没概念,没想到一个上百M的Excel就会占用了十几G的内存。

解决写大Excel的问题

解决写的问题比较容易,使用SXSSFWorkbook代替XSSFWorkbook即可。SXSSFWorkbook写入时只保留一部分Rows在内存中,其他的都写入到硬盘中。
但是SXSSFWorkbook是不能直接读Excel的,要读取时还是要通过XSSFWorkbook。也就是说一旦打开还是要爆内存。
那么解决方案就有两种:

  1. 每次捞30天的数据,分页通过SXSSFWorkbook写入新Excel覆盖掉之前文件。
  2. 还是要想办法将之前那个Excel读出来,和当天数据汇总后通过SXSSFWorkbook写入新Excel覆盖掉之前文件。

虽然1看起来比较容易实现,但是查一个月的数据的话数据库那块的耗时就大了。
所以还是不能放弃2。

解决读大Excel的问题

读了一些POI的文档,使用SAX读取大Excel并将Excel的内容以文本方式搞到内存。
直接上代码:

  1. package com.csot.util;
  2. import java.io.File;
  3. import java.io.IOException;
  4. import java.io.InputStream;
  5. import java.util.ArrayList;
  6. import java.util.HashMap;
  7. import java.util.List;
  8. import javax.xml.parsers.ParserConfigurationException;
  9. import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
  10. import org.apache.poi.openxml4j.opc.OPCPackage;
  11. import org.apache.poi.openxml4j.opc.PackageAccess;
  12. import org.apache.poi.ss.usermodel.DataFormatter;
  13. import org.apache.poi.ss.util.CellAddress;
  14. import org.apache.poi.ss.util.CellReference;
  15. import org.apache.poi.util.SAXHelper;
  16. import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable;
  17. import org.apache.poi.xssf.eventusermodel.XSSFReader;
  18. import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler;
  19. import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler;
  20. import org.apache.poi.xssf.model.StylesTable;
  21. import org.apache.poi.xssf.usermodel.XSSFComment;
  22. import org.xml.sax.ContentHandler;
  23. import org.xml.sax.InputSource;
  24. import org.xml.sax.SAXException;
  25. import org.xml.sax.XMLReader;
  26. public class XlsxReader {
  27. private OPCPackage xlsxPackage;
  28. private StringBuilder stringBuilder;
  29. private List<String> sheetNames;
  30. private HashMap<String, StringBuilder> sheets = new HashMap<String, StringBuilder>(15);
  31. public void tryOpen(String path) throws Exception{
  32. File xlsxFile = new File(path);
  33. if (!xlsxFile.exists()) {
  34. System.err.println("Not found or not a file: " + xlsxFile.getPath());
  35. throw new Exception("File not found");
  36. }
  37. OPCPackage pack = OPCPackage.open(xlsxFile.getPath(), PackageAccess.READ);
  38. pack.close();
  39. }
  40. public void read(String path) throws Exception{
  41. this.sheetNames = new ArrayList<String>();
  42. this.sheets.clear();
  43. File xlsxFile = new File(path);
  44. if (!xlsxFile.exists()) {
  45. System.err.println("Not found or not a file: " + xlsxFile.getPath());
  46. throw new Exception("File not found");
  47. }
  48. this.xlsxPackage = OPCPackage.open(xlsxFile.getPath(), PackageAccess.READ);
  49. this.process();
  50. this.xlsxPackage.close();
  51. }
  52. public List<String> getSheetNames(){
  53. return this.sheetNames;
  54. }
  55. public HashMap<String, StringBuilder> getSheetContents(){
  56. return this.sheets;
  57. }
  58. public StringBuilder getSheetContent(String name){
  59. return this.sheets.get(name);
  60. }
  61. /**
  62. * Initiates the processing of the XLS workbook file to CSV.
  63. *
  64. * @throws IOException
  65. * @throws OpenXML4JException
  66. * @throws ParserConfigurationException
  67. * @throws SAXException
  68. */
  69. private void process()
  70. throws IOException, OpenXML4JException, ParserConfigurationException, SAXException {
  71. ReadOnlySharedStringsTable strings = new ReadOnlySharedStringsTable(this.xlsxPackage);
  72. XSSFReader xssfReader = new XSSFReader(this.xlsxPackage);
  73. StylesTable styles = xssfReader.getStylesTable();
  74. XSSFReader.SheetIterator iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
  75. while (iter.hasNext()) {
  76. InputStream stream = iter.next();
  77. String sheetName = iter.getSheetName();
  78. this.stringBuilder = new StringBuilder();
  79. this.sheetNames.add(sheetName);
  80. processSheet(styles, strings, new SheetToCSV(), stream);
  81. this.sheets.put(sheetName, this.stringBuilder);
  82. stream.close();
  83. }
  84. }
  85. /**
  86. * Parses and shows the content of one sheet
  87. * using the specified styles and shared-strings tables.
  88. *
  89. * @param styles
  90. * @param strings
  91. * @param sheetInputStream
  92. */
  93. private void processSheet(
  94. StylesTable styles,
  95. ReadOnlySharedStringsTable strings,
  96. SheetContentsHandler sheetHandler,
  97. InputStream sheetInputStream)
  98. throws IOException, ParserConfigurationException, SAXException {
  99. DataFormatter formatter = new DataFormatter();
  100. InputSource sheetSource = new InputSource(sheetInputStream);
  101. try {
  102. XMLReader sheetParser = SAXHelper.newXMLReader();
  103. ContentHandler handler = new XSSFSheetXMLHandler(
  104. styles, null, strings, sheetHandler, formatter, false);
  105. sheetParser.setContentHandler(handler);
  106. sheetParser.parse(sheetSource);
  107. } catch (ParserConfigurationException e) {
  108. throw new RuntimeException("SAX parser appears to be broken - " + e.getMessage());
  109. }
  110. }
  111. private class SheetToCSV implements SheetContentsHandler {
  112. private boolean firstCellOfRow = false;
  113. private int currentRow = -1;
  114. private int currentCol = -1;
  115. private void outputMissingRows(int number) {
  116. for (int i = 0; i < number; i++) {
  117. stringBuilder.append('\n');
  118. }
  119. }
  120. @Override
  121. public void startRow(int rowNum) {
  122. // If there were gaps, output the missing rows
  123. outputMissingRows(rowNum - currentRow - 1);
  124. // Prepare for this row
  125. firstCellOfRow = true;
  126. currentRow = rowNum;
  127. currentCol = -1;
  128. }
  129. @Override
  130. public void endRow(int rowNum) {
  131. stringBuilder.append('\n');
  132. }
  133. @Override
  134. public void cell(String cellReference, String formattedValue,
  135. XSSFComment comment) {
  136. if (firstCellOfRow) {
  137. firstCellOfRow = false;
  138. } else {
  139. stringBuilder.append(',');
  140. }
  141. // gracefully handle missing CellRef here in a similar way as XSSFCell does
  142. if (cellReference == null) {
  143. cellReference = new CellAddress(currentRow, currentCol).formatAsString();
  144. }
  145. // Did we miss any cells?
  146. int thisCol = (new CellReference(cellReference)).getCol();
  147. int missedCols = thisCol - currentCol - 1;
  148. for (int i = 0; i < missedCols; i++) {
  149. stringBuilder.append(',');
  150. }
  151. currentCol = thisCol;
  152. // Number or string?
  153. try {
  154. Double.parseDouble(formattedValue);
  155. stringBuilder.append(formattedValue);
  156. } catch (NumberFormatException e) {
  157. //stringBuilder.append('"');
  158. stringBuilder.append(formattedValue);
  159. //stringBuilder.append('"');
  160. }
  161. }
  162. @Override
  163. public void headerFooter(String text, boolean isHeader, String tagName) {
  164. }
  165. }
  166. }

通过这个Reader将Excel读到一个Map中,就只要几百兆内存了。

其他

其实写的时候突然想到,可以用一个临时表装这些数据,每天通过ETL把数据放到临时表,然后每次导出30天的数据。
但是这不重要了(逃)。