]> git.argeo.org Git - lgpl/argeo-commons.git/blobdiff - basic/runtime/org.argeo.basic.nodeps/src/main/java/org/argeo/util/CsvParser.java
Update license headers
[lgpl/argeo-commons.git] / basic / runtime / org.argeo.basic.nodeps / src / main / java / org / argeo / util / CsvParser.java
index 4b4d0c8762016320fc8054aa281f2993a24506ff..127d0f50928f9b358c5f376a9ba8704f9d9ba697 100644 (file)
@@ -1,3 +1,18 @@
+/*
+ * Copyright (C) 2007-2012 Mathieu Baudier
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *         http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package org.argeo.util;
 
 import java.io.BufferedReader;
@@ -29,35 +44,63 @@ public abstract class CsvParser {
         * 
         * @param lineNumber
         *            the current line number, starts at 1 (the header, if header
-        *            processing is enabled, the first lien otherwise)
+        *            processing is enabled, the first line otherwise)
         * @param header
         *            the read-only header or null if {@link #setNoHeader(Boolean)}
         *            is true (default is false)
         * @param tokens
-        *            the parse tokens
+        *            the parsed tokens
         */
        protected abstract void processLine(Integer lineNumber,
                        List<String> header, List<String> tokens);
 
        public synchronized void parse(InputStream in) {
+               parse(in, null);
+       }
+
+       public synchronized void parse(InputStream in, String encoding) {
                BufferedReader reader = null;
                Integer lineCount = 0;
                try {
-                       reader = new BufferedReader(new InputStreamReader(in));
-
+                       if (encoding == null)
+                               reader = new BufferedReader(new InputStreamReader(in));
+                       else
+                               reader = new BufferedReader(new InputStreamReader(in, encoding));
                        List<String> header = null;
                        if (!noHeader) {
                                String headerStr = reader.readLine();
                                if (headerStr == null)// empty file
                                        return;
                                lineCount++;
-                               header = Collections.unmodifiableList(parseLine(headerStr));
+                               header = new ArrayList<String>();
+                               StringBuffer currStr = new StringBuffer("");
+                               Boolean wasInquote = false;
+                               while (parseLine(headerStr, header, currStr, wasInquote)) {
+                                       headerStr = reader.readLine();
+                                       if (headerStr == null)
+                                               break;
+                                       wasInquote = true;
+                               }
+                               header = Collections.unmodifiableList(header);
                        }
 
                        String line = null;
                        lines: while ((line = reader.readLine()) != null) {
+                               line = preProcessLine(line);
+                               if (line == null) {
+                                       // skip line
+                                       continue lines;
+                               }
                                lineCount++;
-                               List<String> tokens = parseLine(line);
+                               List<String> tokens = new ArrayList<String>();
+                               StringBuffer currStr = new StringBuffer("");
+                               Boolean wasInquote = false;
+                               sublines: while (parseLine(line, tokens, currStr, wasInquote)) {
+                                       line = reader.readLine();
+                                       if (line == null)
+                                               break sublines;
+                                       wasInquote = true;
+                               }
                                if (!noHeader && strictLineAsLongAsHeader) {
                                        int headerSize = header.size();
                                        int tokenSize = tokens.size();
@@ -88,19 +131,44 @@ public abstract class CsvParser {
                }
        }
 
-       /** Parses a line character by character for performance purpose */
-       protected List<String> parseLine(String str) {
-               List<String> tokens = new ArrayList<String>();
+       /**
+        * Called before each (logical) line is processed, giving a chance to modify
+        * it (typically for cleaning dirty files). To be overridden; returns the
+        * line unchanged by default. The line is skipped if 'null' is returned.
+        */
+       protected String preProcessLine(String line) {
+               return line;
+       }
+
+       /**
+        * Parses a line character by character for performance purposes
+        * 
+        * @return whether the line ended inside a quote and continues on the next line
+        */
+       protected Boolean parseLine(String str, List<String> tokens,
+                       StringBuffer currStr, Boolean wasInquote) {
+               // List<String> tokens = new ArrayList<String>();
+
+               // System.out.println("#LINE: " + str);
+
+               if (wasInquote)
+                       currStr.append('\n');
 
                char[] arr = str.toCharArray();
-               boolean inQuote = false;
-               StringBuffer currStr = new StringBuffer("");
+               boolean inQuote = wasInquote;
+               // StringBuffer currStr = new StringBuffer("");
                for (int i = 0; i < arr.length; i++) {
                        char c = arr[i];
                        if (c == separator) {
                                if (!inQuote) {
                                        tokens.add(currStr.toString());
-                                       currStr = new StringBuffer("");
+                                       // System.out.println("# TOKEN: " + currStr);
+                                       currStr.delete(0, currStr.length());
+                               } else {
+                                       // we don't remove separators that are in a quoted substring
+                                       // System.out
+                                       // .println("IN QUOTE, got a separator: [" + c + "]");
+                                       currStr.append(c);
                                }
                        } else if (c == quote) {
                                if (inQuote && (i + 1) < arr.length && arr[i + 1] == quote) {
@@ -114,11 +182,19 @@ public abstract class CsvParser {
                                currStr.append(c);
                        }
                }
-               tokens.add(currStr.toString());
+
+               if (!inQuote) {
+                       tokens.add(currStr.toString());
+                       // System.out.println("# TOKEN: " + currStr);
+               }
+               // if (inQuote)
+               // throw new ArgeoException("Missing quote at the end of the line "
+               // + str + " (parsed: " + tokens + ")");
                if (inQuote)
-                       throw new ArgeoException("Missing quote at the end of the line "
-                                       + str + " (parsed: " + tokens + ")");
-               return tokens;
+                       return true;
+               else
+                       return false;
+               // return tokens;
        }
 
        public char getSeparator() {