/*
 * From: Mathieu Baudier
 * Date: Tue, 11 Jan 2011 21:48:40 +0000 (+0000)
 * Subject: Introduce CsvParser
 * X-Git-Tag: argeo-commons-2.1.30~1541
 * X-Git-Url: http://git.argeo.org/?a=commitdiff_plain;h=56472382695a908e322c711070116aa64ca53b85;p=lgpl%2Fargeo-commons.git
 *
 * Introduce CsvParser
 *
 * git-svn-id: https://svn.argeo.org/commons/trunk@4008 4cfe0d0a-d680-48aa-b62c-e0a02a3f76cc
 */

// ---------------------------------------------------------------------------
// File: basic/runtime/org.argeo.basic.nodeps/src/main/java/org/argeo/util/CsvParser.java
// ---------------------------------------------------------------------------
package org.argeo.util;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import org.argeo.ArgeoException;

/**
 * Parses a CSV file, interpreting the first line as a header unless
 * {@link #setNoHeader(Boolean)} has been set to true. The input is read line
 * by line, so a quoted field cannot span several lines. The {@code parse}
 * methods and the setters are synchronized so that the object cannot be
 * modified while parsing.
 */
public abstract class CsvParser {
    private char separator = ',';
    private char quote = '\"';

    private Boolean noHeader = false;
    private Boolean strictLineAsLongAsHeader = true;

    /**
     * Actually process a parsed line. If
     * {@link #setStrictLineAsLongAsHeader(Boolean)} is true (default) the
     * header and the tokens are guaranteed to have the same size.
     *
     * @param lineNumber
     *            the current line number, starts at 1 (the header, if header
     *            processing is enabled, the first line otherwise). When a
     *            header is read it counts as line 1, so the first call
     *            receives 2; skipped empty lines are counted as well.
     * @param header
     *            the read-only header or null if {@link #setNoHeader(Boolean)}
     *            is true (default is false)
     * @param tokens
     *            the parsed tokens
     */
    protected abstract void processLine(Integer lineNumber,
            List<String> header, List<String> tokens);

    /**
     * Parses the stream using the platform default charset and calls
     * {@link #processLine(Integer, List, List)} for each data line. The stream
     * is closed when this method returns.
     *
     * @param in
     *            the CSV content
     * @throws ArgeoException
     *             if the stream cannot be read, if a line has an unbalanced
     *             quote, or (in strict mode) if a line does not have as many
     *             tokens as the header
     */
    public synchronized void parse(InputStream in) {
        parse(in, null);
    }

    /**
     * Parses the stream, decoding it with the given charset, and calls
     * {@link #processLine(Integer, List, List)} for each data line. The stream
     * is closed when this method returns.
     *
     * @param in
     *            the CSV content
     * @param encoding
     *            the name of the charset used to decode the stream (e.g.
     *            "UTF-8"), or null to use the platform default charset
     * @throws ArgeoException
     *             if the encoding is not supported, if the stream cannot be
     *             read, if a line has an unbalanced quote, or (in strict mode)
     *             if a line does not have as many tokens as the header
     */
    public synchronized void parse(InputStream in, String encoding) {
        BufferedReader reader = null;
        int lineCount = 0;
        try {
            // An unsupported encoding raises an IOException subclass, which is
            // reported through the catch clause below.
            if (encoding == null)
                reader = new BufferedReader(new InputStreamReader(in));
            else
                reader = new BufferedReader(new InputStreamReader(in,
                        encoding));

            List<String> header = null;
            if (!noHeader) {
                String headerStr = reader.readLine();
                if (headerStr == null)// empty file
                    return;
                lineCount++;
                header = Collections.unmodifiableList(parseLine(headerStr));
            }

            String line = null;
            lines: while ((line = reader.readLine()) != null) {
                lineCount++;
                List<String> tokens = parseLine(line);
                if (!noHeader && strictLineAsLongAsHeader) {
                    int headerSize = header.size();
                    int tokenSize = tokens.size();
                    // Blank lines are only skipped in strict header mode;
                    // otherwise they reach processLine as one empty token.
                    if (tokenSize == 1 && line.trim().equals(""))
                        continue lines;// empty line
                    if (headerSize != tokenSize) {
                        throw new ArgeoException("Token size " + tokenSize
                                + " is different from header size "
                                + headerSize + " at line " + lineCount
                                + ", line: " + line + ", header: " + header
                                + ", tokens: " + tokens);
                    }
                }
                processLine(lineCount, header, tokens);
            }
        } catch (IOException e) {
            throw new ArgeoException("Cannot parse CSV file (line: "
                    + lineCount + ")", e);
        } finally {
            // Closing the reader also closes the underlying stream; fall back
            // to the raw stream when the reader could not be created.
            try {
                if (reader != null)
                    reader.close();
                else if (in != null)
                    in.close();
            } catch (Exception ignored) {
                // best effort: a failure to close must not hide the outcome
                // of the parsing itself
            }
        }
    }

    /**
     * Parses a line character by character for performance purpose. Fields are
     * split on the separator, except inside a quoted section where the
     * separator is kept as part of the value. Quote characters delimiting a
     * section are removed, and a doubled quote inside a quoted section yields
     * one literal quote.
     *
     * @param str
     *            the line to parse, without its line terminator
     * @return the tokens of the line, always at least one (an empty line
     *         yields a single empty token)
     * @throws ArgeoException
     *             if a quoted section is still open at the end of the line
     */
    protected List<String> parseLine(String str) {
        List<String> tokens = new ArrayList<String>();

        char[] arr = str.toCharArray();
        boolean inQuote = false;
        StringBuilder currStr = new StringBuilder();
        for (int i = 0; i < arr.length; i++) {
            char c = arr[i];
            if (c == separator) {
                if (inQuote) {
                    // a separator within quotes belongs to the value and must
                    // not be dropped
                    currStr.append(c);
                } else {
                    tokens.add(currStr.toString());
                    currStr.setLength(0);
                }
            } else if (c == quote) {
                if (inQuote && (i + 1) < arr.length && arr[i + 1] == quote) {
                    // case of double quote: keep one and skip the other
                    currStr.append(quote);
                    i++;
                } else {// standard: opens or closes a quoted section
                    inQuote = !inQuote;
                }
            } else {
                currStr.append(c);
            }
        }
        // the last token is not followed by a separator
        tokens.add(currStr.toString());
        if (inQuote)
            throw new ArgeoException("Missing quote at the end of the line "
                    + str + " (parsed: " + tokens + ")");
        return tokens;
    }

    public char getSeparator() {
        return separator;
    }

    public synchronized void setSeparator(char separator) {
        this.separator = separator;
    }

    public char getQuote() {
        return quote;
    }

    public synchronized void setQuote(char quote) {
        this.quote = quote;
    }

    public Boolean getNoHeader() {
        return noHeader;
    }

    public synchronized void setNoHeader(Boolean noHeader) {
        this.noHeader = noHeader;
    }

    public Boolean getStrictLineAsLongAsHeader() {
        return strictLineAsLongAsHeader;
    }

    public synchronized void setStrictLineAsLongAsHeader(
            Boolean strictLineAsLongAsHeader) {
        this.strictLineAsLongAsHeader = strictLineAsLongAsHeader;
    }

}

// ---------------------------------------------------------------------------
// File: basic/runtime/org.argeo.basic.nodeps/src/main/java/org/argeo/util/CsvParserWithLinesAsMap.java
// ---------------------------------------------------------------------------
package org.argeo.util;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.argeo.ArgeoException;

/**
 * CSV parser allowing to process lines as maps whose keys are the header
 * fields.
 */
public abstract class CsvParserWithLinesAsMap extends CsvParser {

    /**
     * Actually processes a line.
     *
     * @param lineNumber
     *            the current line number, starts at 1 (the header, if header
     *            processing is enabled, the first line otherwise)
     * @param line
     *            the parsed tokens as a map whose keys are the header fields
     */
    protected abstract void processLine(Integer lineNumber,
            Map<String, String> line);

    /**
     * Converts the tokens of a line into a map keyed by the header fields and
     * delegates to {@link #processLine(Integer, Map)}. A header field without
     * a matching token is mapped to null; tokens beyond the header size are
     * ignored.
     *
     * @throws ArgeoException
     *             if there is no header
     */
    @Override
    protected final void processLine(Integer lineNumber, List<String> header,
            List<String> tokens) {
        if (header == null)
            throw new ArgeoException("Only CSV with header is supported");
        Map<String, String> line = new HashMap<String, String>();
        for (int i = 0; i < header.size(); i++) {
            String key = header.get(i);
            String value = null;
            if (i < tokens.size())
                value = tokens.get(i);
            line.put(key, value);
        }
        processLine(lineNumber, line);
    }

}

// ---------------------------------------------------------------------------
// File: basic/runtime/org.argeo.basic.nodeps/src/test/java/org/argeo/util/CsvParserTestCase.java
// ---------------------------------------------------------------------------
package org.argeo.util;

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;

import junit.framework.TestCase;

public class CsvParserTestCase extends TestCase {
    /** Quoted and unquoted fields, including doubled quotes. */
    public void testParse() throws Exception {
        String toParse = "Header1,\"Header2\",Header3,\"Header4\"\n"
                + "Col1,\"Col2\",Col3,\"\"\"Col4\"\"\"\n"
                + "Col1,\"Col2\",Col3,\"\"\"Col4\"\"\"\n"
                + "Col1,\"Col2\",Col3,\"\"\"Col4\"\"\"\n";

        InputStream in = new ByteArrayInputStream(toParse.getBytes());

        CsvParser csvParser = new CsvParser() {
            protected void processLine(Integer lineNumber,
                    List<String> header, List<String> tokens) {
                assertEquals(header.size(), tokens.size());
                assertEquals(4, tokens.size());
                assertEquals("Col1", tokens.get(0));
                assertEquals("Col2", tokens.get(1));
                assertEquals("Col3", tokens.get(2));
                assertEquals("\"Col4\"", tokens.get(3));
            }
        };

        csvParser.parse(in);
        in.close();
    }

    /** A separator inside a quoted field must be kept in the value. */
    public void testParseSeparatorWithinQuotes() throws Exception {
        String toParse = "Header1,Header2\n" + "\"a,b\",c\n";

        InputStream in = new ByteArrayInputStream(toParse.getBytes());

        final List<String> seen = new ArrayList<String>();
        CsvParser csvParser = new CsvParser() {
            protected void processLine(Integer lineNumber,
                    List<String> header, List<String> tokens) {
                seen.addAll(tokens);
            }
        };

        csvParser.parse(in);
        in.close();

        assertEquals(2, seen.size());
        assertEquals("a,b", seen.get(0));
        assertEquals("c", seen.get(1));
    }
}