X-Git-Url: https://git.argeo.org/?a=blobdiff_plain;f=basic%2Fruntime%2Forg.argeo.basic.nodeps%2Fsrc%2Fmain%2Fjava%2Forg%2Fargeo%2Futil%2FCsvParser.java;h=127d0f50928f9b358c5f376a9ba8704f9d9ba697;hb=1d5afdce3e91054f07ddd3c98309c363b4cf1d46;hp=4b4d0c8762016320fc8054aa281f2993a24506ff;hpb=56472382695a908e322c711070116aa64ca53b85;p=lgpl%2Fargeo-commons.git diff --git a/basic/runtime/org.argeo.basic.nodeps/src/main/java/org/argeo/util/CsvParser.java b/basic/runtime/org.argeo.basic.nodeps/src/main/java/org/argeo/util/CsvParser.java index 4b4d0c876..127d0f509 100644 --- a/basic/runtime/org.argeo.basic.nodeps/src/main/java/org/argeo/util/CsvParser.java +++ b/basic/runtime/org.argeo.basic.nodeps/src/main/java/org/argeo/util/CsvParser.java @@ -1,3 +1,18 @@ +/* + * Copyright (C) 2007-2012 Mathieu Baudier + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package org.argeo.util; import java.io.BufferedReader; @@ -29,35 +44,63 @@ public abstract class CsvParser { * * @param lineNumber * the current line number, starts at 1 (the header, if header - * processing is enabled, the first lien otherwise) + * processing is enabled, the first line otherwise) * @param header * the read-only header or null if {@link #setNoHeader(Boolean)} * is true (default is false) * @param tokens - * the parse tokens + * the parsed tokens */ protected abstract void processLine(Integer lineNumber, List header, List tokens); public synchronized void parse(InputStream in) { + parse(in, null); + } + + public synchronized void parse(InputStream in, String encoding) { BufferedReader reader = null; Integer lineCount = 0; try { - reader = new BufferedReader(new InputStreamReader(in)); - + if (encoding == null) + reader = new BufferedReader(new InputStreamReader(in)); + else + reader = new BufferedReader(new InputStreamReader(in, encoding)); List header = null; if (!noHeader) { String headerStr = reader.readLine(); if (headerStr == null)// empty file return; lineCount++; - header = Collections.unmodifiableList(parseLine(headerStr)); + header = new ArrayList(); + StringBuffer currStr = new StringBuffer(""); + Boolean wasInquote = false; + while (parseLine(headerStr, header, currStr, wasInquote)) { + headerStr = reader.readLine(); + if (headerStr == null) + break; + wasInquote = true; + } + header = Collections.unmodifiableList(header); } String line = null; lines: while ((line = reader.readLine()) != null) { + line = preProcessLine(line); + if (line == null) { + // skip line + continue lines; + } lineCount++; - List tokens = parseLine(line); + List tokens = new ArrayList(); + StringBuffer currStr = new StringBuffer(""); + Boolean wasInquote = false; + sublines: while (parseLine(line, tokens, currStr, wasInquote)) { + line = reader.readLine(); + if (line == null) + break sublines; + wasInquote = true; + } if (!noHeader && strictLineAsLongAsHeader) 
{ int headerSize = header.size(); int tokenSize = tokens.size(); @@ -88,19 +131,44 @@ public abstract class CsvParser { } } - /** Parses a line character by character for performance purpose */ - protected List parseLine(String str) { - List tokens = new ArrayList(); + /** + * Called before each (logical) line is processed, giving a chance to modify + * it (typically for cleaning dirty files). To be overridden, return the + * line unchanged by default. Skip the line if 'null' is returned. + */ + protected String preProcessLine(String line) { + return line; + } + + /** + * Parses a line character by character for performance purpose + * + * @return whether to continue parsing this line + */ + protected Boolean parseLine(String str, List tokens, + StringBuffer currStr, Boolean wasInquote) { + // List tokens = new ArrayList(); + + // System.out.println("#LINE: " + str); + + if (wasInquote) + currStr.append('\n'); char[] arr = str.toCharArray(); - boolean inQuote = false; - StringBuffer currStr = new StringBuffer(""); + boolean inQuote = wasInquote; + // StringBuffer currStr = new StringBuffer(""); for (int i = 0; i < arr.length; i++) { char c = arr[i]; if (c == separator) { if (!inQuote) { tokens.add(currStr.toString()); - currStr = new StringBuffer(""); + // System.out.println("# TOKEN: " + currStr); + currStr.delete(0, currStr.length()); + } else { + // we don't remove separators that are in a quoted substring + // System.out + // .println("IN QUOTE, got a separator: [" + c + "]"); + currStr.append(c); } } else if (c == quote) { if (inQuote && (i + 1) < arr.length && arr[i + 1] == quote) { @@ -114,11 +182,19 @@ public abstract class CsvParser { currStr.append(c); } } - tokens.add(currStr.toString()); + + if (!inQuote) { + tokens.add(currStr.toString()); + // System.out.println("# TOKEN: " + currStr); + } + // if (inQuote) + // throw new 
ArgeoException("Missing quote at the end of the line " - + str + " (parsed: " + tokens + ")"); - return tokens; + return true; + else + return false; + // return tokens; } public char getSeparator() {