Introduce CsvParser
author Mathieu Baudier <mbaudier@argeo.org>
Tue, 11 Jan 2011 21:48:40 +0000 (21:48 +0000)
committer Mathieu Baudier <mbaudier@argeo.org>
Tue, 11 Jan 2011 21:48:40 +0000 (21:48 +0000)
git-svn-id: https://svn.argeo.org/commons/trunk@4008 4cfe0d0a-d680-48aa-b62c-e0a02a3f76cc

basic/runtime/org.argeo.basic.nodeps/src/main/java/org/argeo/util/CsvParser.java [new file with mode: 0644]
basic/runtime/org.argeo.basic.nodeps/src/main/java/org/argeo/util/CsvParserWithLinesAsMap.java [new file with mode: 0644]
basic/runtime/org.argeo.basic.nodeps/src/test/java/org/argeo/util/CsvParserTestCase.java [new file with mode: 0644]

diff --git a/basic/runtime/org.argeo.basic.nodeps/src/main/java/org/argeo/util/CsvParser.java b/basic/runtime/org.argeo.basic.nodeps/src/main/java/org/argeo/util/CsvParser.java
new file mode 100644 (file)
index 0000000..4b4d0c8
--- /dev/null
@@ -0,0 +1,157 @@
+package org.argeo.util;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+import org.argeo.ArgeoException;
+
+/**
+ * Parses a CSV file interpreting the first line as a header. The
+ * {@link #parse(InputStream)} method and the setters are synchronized so that
+ * the object cannot be modified when parsing.
+ */
+public abstract class CsvParser {
+       private char separator = ',';
+       private char quote = '\"';
+
+       private Boolean noHeader = false;
+       private Boolean strictLineAsLongAsHeader = true;
+
+       /**
+        * Actually process a parsed line. If
+        * {@link #setStrictLineAsLongAsHeader(Boolean)} is true (default) the
+        * header and the tokens are guaranteed to have the same size.
+        * 
+        * @param lineNumber
+        *            the current line number, starts at 1 (the header, if header
+        *            processing is enabled, the first lien otherwise)
+        * @param header
+        *            the read-only header or null if {@link #setNoHeader(Boolean)}
+        *            is true (default is false)
+        * @param tokens
+        *            the parse tokens
+        */
+       protected abstract void processLine(Integer lineNumber,
+                       List<String> header, List<String> tokens);
+
+       public synchronized void parse(InputStream in) {
+               BufferedReader reader = null;
+               Integer lineCount = 0;
+               try {
+                       reader = new BufferedReader(new InputStreamReader(in));
+
+                       List<String> header = null;
+                       if (!noHeader) {
+                               String headerStr = reader.readLine();
+                               if (headerStr == null)// empty file
+                                       return;
+                               lineCount++;
+                               header = Collections.unmodifiableList(parseLine(headerStr));
+                       }
+
+                       String line = null;
+                       lines: while ((line = reader.readLine()) != null) {
+                               lineCount++;
+                               List<String> tokens = parseLine(line);
+                               if (!noHeader && strictLineAsLongAsHeader) {
+                                       int headerSize = header.size();
+                                       int tokenSize = tokens.size();
+                                       if (tokenSize == 1 && line.trim().equals(""))
+                                               continue lines;// empty line
+                                       if (headerSize != tokenSize) {
+                                               throw new ArgeoException("Token size " + tokenSize
+                                                               + " is different from header size "
+                                                               + headerSize + " at line " + lineCount
+                                                               + ", line: " + line + ", header: " + header
+                                                               + ", tokens: " + tokens);
+                                       }
+                               }
+                               processLine(lineCount, header, tokens);
+                       }
+               } catch (ArgeoException e) {
+                       throw e;
+               } catch (IOException e) {
+                       throw new ArgeoException("Cannot parse CSV file (line: "
+                                       + lineCount + ")", e);
+               } finally {
+                       if (reader != null)
+                               try {
+                                       reader.close();
+                               } catch (Exception e2) {
+                                       // silent
+                               }
+               }
+       }
+
+       /** Parses a line character by character for performance purpose */
+       protected List<String> parseLine(String str) {
+               List<String> tokens = new ArrayList<String>();
+
+               char[] arr = str.toCharArray();
+               boolean inQuote = false;
+               StringBuffer currStr = new StringBuffer("");
+               for (int i = 0; i < arr.length; i++) {
+                       char c = arr[i];
+                       if (c == separator) {
+                               if (!inQuote) {
+                                       tokens.add(currStr.toString());
+                                       currStr = new StringBuffer("");
+                               }
+                       } else if (c == quote) {
+                               if (inQuote && (i + 1) < arr.length && arr[i + 1] == quote) {
+                                       // case of double quote
+                                       currStr.append(quote);
+                                       i++;
+                               } else {// standard
+                                       inQuote = inQuote ? false : true;
+                               }
+                       } else {
+                               currStr.append(c);
+                       }
+               }
+               tokens.add(currStr.toString());
+               if (inQuote)
+                       throw new ArgeoException("Missing quote at the end of the line "
+                                       + str + " (parsed: " + tokens + ")");
+               return tokens;
+       }
+
+       public char getSeparator() {
+               return separator;
+       }
+
+       public synchronized void setSeparator(char separator) {
+               this.separator = separator;
+       }
+
+       public char getQuote() {
+               return quote;
+       }
+
+       public synchronized void setQuote(char quote) {
+               this.quote = quote;
+       }
+
+       public Boolean getNoHeader() {
+               return noHeader;
+       }
+
+       public synchronized void setNoHeader(Boolean noHeader) {
+               this.noHeader = noHeader;
+       }
+
+       public Boolean getStrictLineAsLongAsHeader() {
+               return strictLineAsLongAsHeader;
+       }
+
+       public synchronized void setStrictLineAsLongAsHeader(
+                       Boolean strictLineAsLongAsHeader) {
+               this.strictLineAsLongAsHeader = strictLineAsLongAsHeader;
+       }
+
+}
diff --git a/basic/runtime/org.argeo.basic.nodeps/src/main/java/org/argeo/util/CsvParserWithLinesAsMap.java b/basic/runtime/org.argeo.basic.nodeps/src/main/java/org/argeo/util/CsvParserWithLinesAsMap.java
new file mode 100644 (file)
index 0000000..aa3198f
--- /dev/null
@@ -0,0 +1,42 @@
+package org.argeo.util;
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.argeo.ArgeoException;
+
+/**
+ * CSV parser allowing to process lines as maps whose keys are the header
+ * fields.
+ */
+public abstract class CsvParserWithLinesAsMap extends CsvParser {
+
+       /**
+        * Actually processes a line.
+        * 
+        * @param lineNumber
+        *            the current line number, starts at 1 (the header, if header
+        *            processing is enabled, the first lien otherwise)
+        * @param line
+        *            the parsed tokens as a map whose keys are the header fields
+        */
+       protected abstract void processLine(Integer lineNumber,
+                       Map<String, String> line);
+
+       protected final void processLine(Integer lineNumber, List<String> header,
+                       List<String> tokens) {
+               if (header == null)
+                       throw new ArgeoException("Only CSV with header is supported");
+               Map<String, String> line = new HashMap<String, String>();
+               for (int i = 0; i < header.size(); i++) {
+                       String key = header.get(i);
+                       String value = null;
+                       if (i < tokens.size())
+                               value = tokens.get(i);
+                       line.put(key, value);
+               }
+               processLine(lineNumber, line);
+       }
+
+}
diff --git a/basic/runtime/org.argeo.basic.nodeps/src/test/java/org/argeo/util/CsvParserTestCase.java b/basic/runtime/org.argeo.basic.nodeps/src/test/java/org/argeo/util/CsvParserTestCase.java
new file mode 100644 (file)
index 0000000..0b17d67
--- /dev/null
@@ -0,0 +1,33 @@
+package org.argeo.util;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+import java.util.List;
+
+import junit.framework.TestCase;
+
+public class CsvParserTestCase extends TestCase {
+       public void testParse() throws Exception {
+               String toParse = "Header1,\"Header2\",Header3,\"Header4\"\n"
+                               + "Col1,\"Col2\",Col3,\"\"\"Col4\"\"\"\n"
+                               + "Col1,\"Col2\",Col3,\"\"\"Col4\"\"\"\n"
+                               + "Col1,\"Col2\",Col3,\"\"\"Col4\"\"\"\n";
+
+               InputStream in = new ByteArrayInputStream(toParse.getBytes());
+
+               CsvParser csvParser = new CsvParser() {
+                       protected void processLine(Integer lineNumber, List<String> header,
+                                       List<String> tokens) {
+                               assertEquals(header.size(), tokens.size());
+                               assertEquals(4, tokens.size());
+                               assertEquals("Col1", tokens.get(0));
+                               assertEquals("Col2", tokens.get(1));
+                               assertEquals("Col3", tokens.get(2));
+                               assertEquals("\"Col4\"", tokens.get(3));
+                       }
+               };
+
+               csvParser.parse(in);
+               in.close();
+       }
+}