X-Git-Url: https://git.argeo.org/?a=blobdiff_plain;f=org.argeo.util%2Fsrc%2Forg%2Fargeo%2Futil%2FCsvParser.java;fp=org.argeo.util%2Fsrc%2Forg%2Fargeo%2Futil%2FCsvParser.java;h=b903f77226457bbab9ded55926899677e98d34f2;hb=9f729eeb8255a9d800ad2506735dda8cc215a135;hp=0000000000000000000000000000000000000000;hpb=f9efbe5228615951dd8482a4582aa24e00c10ce5;p=lgpl%2Fargeo-commons.git diff --git a/org.argeo.util/src/org/argeo/util/CsvParser.java b/org.argeo.util/src/org/argeo/util/CsvParser.java new file mode 100644 index 000000000..b903f7722 --- /dev/null +++ b/org.argeo.util/src/org/argeo/util/CsvParser.java @@ -0,0 +1,242 @@ +package org.argeo.util; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.io.UnsupportedEncodingException; +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +/** + * Parses a CSV file interpreting the first line as a header. The + * {@link #parse(InputStream)} method and the setters are synchronized so that + * the object cannot be modified when parsing. + */ +public abstract class CsvParser { + private char separator = ','; + private char quote = '\"'; + + private Boolean noHeader = false; + private Boolean strictLineAsLongAsHeader = true; + + /** + * Actually process a parsed line. If + * {@link #setStrictLineAsLongAsHeader(Boolean)} is true (default) the header + * and the tokens are guaranteed to have the same size. + * + * @param lineNumber the current line number, starts at 1 (the header, if header + * processing is enabled, the first line otherwise) + * @param header the read-only header or null if + * {@link #setNoHeader(Boolean)} is true (default is false) + * @param tokens the parsed tokens + */ + protected abstract void processLine(Integer lineNumber, List header, List tokens); + + /** + * Parses the CSV file (stream is closed at the end) + * + * @param in the stream to parse + * + * @deprecated Use {@link #parse(InputStream, Charset)} instead. + */ + @Deprecated + public synchronized void parse(InputStream in) { + parse(in, (Charset) null); + } + + /** + * Parses the CSV file (stream is closed at the end) + * + * @param in the stream to parse + * @param encoding the encoding to use. + * + * @deprecated Use {@link #parse(InputStream, Charset)} instead. + */ + @Deprecated + public synchronized void parse(InputStream in, String encoding) { + Reader reader; + if (encoding == null) + reader = new InputStreamReader(in); + else + try { + reader = new InputStreamReader(in, encoding); + } catch (UnsupportedEncodingException e) { + throw new IllegalArgumentException(e); + } + parse(reader); + } + + /** + * Parses the CSV file (stream is closed at the end) + * + * @param in the stream to parse + * @param charset the charset to use + */ + public synchronized void parse(InputStream in, Charset charset) { + Reader reader; + if (charset == null) + reader = new InputStreamReader(in); + else + reader = new InputStreamReader(in, charset); + parse(reader); + } + + /** + * Parses the CSV file (stream is closed at the end) + * + * @param reader the reader to use (it will be buffered) + */ + public synchronized void parse(Reader reader) { + Integer lineCount = 0; + try (BufferedReader bufferedReader = new BufferedReader(reader)) { + List header = null; + if (!noHeader) { + String headerStr = bufferedReader.readLine(); + if (headerStr == null)// empty file + return; + lineCount++; + header = new ArrayList(); + StringBuffer currStr = new StringBuffer(""); + Boolean wasInquote = false; + while (parseLine(headerStr, header, currStr, wasInquote)) { + headerStr = bufferedReader.readLine(); + if (headerStr == null) + break; + wasInquote = true; + } + header = Collections.unmodifiableList(header); + } + + String line = null; + lines: while ((line = bufferedReader.readLine()) != null) { + line = preProcessLine(line); + if (line == null) { + // skip line + continue lines; + } + lineCount++; + List tokens = new ArrayList(); + StringBuffer currStr = new StringBuffer(""); + Boolean wasInquote = false; + sublines: while (parseLine(line, tokens, currStr, wasInquote)) { + line = bufferedReader.readLine(); + if (line == null) + break sublines; + wasInquote = true; + } + if (!noHeader && strictLineAsLongAsHeader) { + int headerSize = header.size(); + int tokenSize = tokens.size(); + if (tokenSize == 1 && line.trim().equals("")) + continue lines;// empty line + if (headerSize != tokenSize) { + throw new IllegalStateException("Token size " + tokenSize + " is different from header size " + + headerSize + " at line " + lineCount + ", line: " + line + ", header: " + header + + ", tokens: " + tokens); + } + } + processLine(lineCount, header, tokens); + } + } catch (IOException e) { + throw new RuntimeException("Cannot parse CSV file (line: " + lineCount + ")", e); + } + } + + /** + * Called before each (logical) line is processed, giving a change to modify it + * (typically for cleaning dirty files). To be overridden, return the line + * unchanged by default. Skip the line if 'null' is returned. + */ + protected String preProcessLine(String line) { + return line; + } + + /** + * Parses a line character by character for performance purpose + * + * @return whether to continue parsing this line + */ + protected Boolean parseLine(String str, List tokens, StringBuffer currStr, Boolean wasInquote) { + if (wasInquote) + currStr.append('\n'); + + char[] arr = str.toCharArray(); + boolean inQuote = wasInquote; + for (int i = 0; i < arr.length; i++) { + char c = arr[i]; + if (c == separator) { + if (!inQuote) { + tokens.add(currStr.toString()); +// currStr.delete(0, currStr.length()); + currStr.setLength(0); + currStr.trimToSize(); + } else { + // we don't remove separator that are in a quoted substring + // System.out + // .println("IN QUOTE, got a separator: [" + c + "]"); + currStr.append(c); + } + } else if (c == quote) { + if (inQuote && (i + 1) < arr.length && arr[i + 1] == quote) { + // case of double quote + currStr.append(quote); + i++; + } else {// standard + inQuote = inQuote ? false : true; + } + } else { + currStr.append(c); + } + } + + if (!inQuote) { + tokens.add(currStr.toString()); + // System.out.println("# TOKEN: " + currStr); + } + // if (inQuote) + // throw new ArgeoException("Missing quote at the end of the line " + // + str + " (parsed: " + tokens + ")"); + if (inQuote) + return true; + else + return false; + // return tokens; + } + + public char getSeparator() { + return separator; + } + + public synchronized void setSeparator(char separator) { + this.separator = separator; + } + + public char getQuote() { + return quote; + } + + public synchronized void setQuote(char quote) { + this.quote = quote; + } + + public Boolean getNoHeader() { + return noHeader; + } + + public synchronized void setNoHeader(Boolean noHeader) { + this.noHeader = noHeader; + } + + public Boolean getStrictLineAsLongAsHeader() { + return strictLineAsLongAsHeader; + } + + public synchronized void setStrictLineAsLongAsHeader(Boolean strictLineAsLongAsHeader) { + this.strictLineAsLongAsHeader = strictLineAsLongAsHeader; + } + +}