2 * Copyright (C) 2007-2012 Argeo GmbH
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
16 package org
.argeo
.util
;
18 import java
.io
.BufferedReader
;
19 import java
.io
.IOException
;
20 import java
.io
.InputStream
;
21 import java
.io
.InputStreamReader
;
22 import java
.util
.ArrayList
;
23 import java
.util
.Collections
;
24 import java
.util
.List
;
26 import org
.argeo
.ArgeoException
;
27 import org
.argeo
.StreamUtils
;
30 * Parses a CSV file interpreting the first line as a header. The
31 * {@link #parse(InputStream)} method and the setters are synchronized so that
32 * the object cannot be modified when parsing.
34 public abstract class CsvParser
{
35 private char separator
= ',';
36 private char quote
= '\"';
38 private Boolean noHeader
= false;
39 private Boolean strictLineAsLongAsHeader
= true;
42 * Actually process a parsed line. If
43 * {@link #setStrictLineAsLongAsHeader(Boolean)} is true (default) the
44 * header and the tokens are guaranteed to have the same size.
47 * the current line number, starts at 1 (the header, if header
48 * processing is enabled, the first line otherwise)
50 * the read-only header or null if {@link #setNoHeader(Boolean)}
51 * is true (default is false)
55 protected abstract void processLine(Integer lineNumber
,
56 List
<String
> header
, List
<String
> tokens
);
59 * Parses the CSV file (stream is closed at the end)
61 public synchronized void parse(InputStream in
) {
66 * Parses the CSV file (stream is closed at the end)
68 public synchronized void parse(InputStream in
, String encoding
) {
69 BufferedReader reader
= null;
70 Integer lineCount
= 0;
73 reader
= new BufferedReader(new InputStreamReader(in
));
75 reader
= new BufferedReader(new InputStreamReader(in
, encoding
));
76 List
<String
> header
= null;
78 String headerStr
= reader
.readLine();
79 if (headerStr
== null)// empty file
82 header
= new ArrayList
<String
>();
83 StringBuffer currStr
= new StringBuffer("");
84 Boolean wasInquote
= false;
85 while (parseLine(headerStr
, header
, currStr
, wasInquote
)) {
86 headerStr
= reader
.readLine();
87 if (headerStr
== null)
91 header
= Collections
.unmodifiableList(header
);
95 lines
: while ((line
= reader
.readLine()) != null) {
96 line
= preProcessLine(line
);
102 List
<String
> tokens
= new ArrayList
<String
>();
103 StringBuffer currStr
= new StringBuffer("");
104 Boolean wasInquote
= false;
105 sublines
: while (parseLine(line
, tokens
, currStr
, wasInquote
)) {
106 line
= reader
.readLine();
111 if (!noHeader
&& strictLineAsLongAsHeader
) {
112 int headerSize
= header
.size();
113 int tokenSize
= tokens
.size();
114 if (tokenSize
== 1 && line
.trim().equals(""))
115 continue lines
;// empty line
116 if (headerSize
!= tokenSize
) {
117 throw new ArgeoException("Token size " + tokenSize
118 + " is different from header size "
119 + headerSize
+ " at line " + lineCount
120 + ", line: " + line
+ ", header: " + header
121 + ", tokens: " + tokens
);
124 processLine(lineCount
, header
, tokens
);
126 } catch (ArgeoException e
) {
128 } catch (IOException e
) {
129 throw new ArgeoException("Cannot parse CSV file (line: "
130 + lineCount
+ ")", e
);
132 StreamUtils
.closeQuietly(reader
);
137 * Called before each (logical) line is processed, giving a chance to modify
138 * it (typically for cleaning dirty files). To be overridden; returns the
139 * line unchanged by default. The line is skipped if 'null' is returned.
141 protected String
preProcessLine(String line
) {
146 * Parses a line character by character for performance purposes
148 * @return whether to continue parsing this line
150 protected Boolean
parseLine(String str
, List
<String
> tokens
,
151 StringBuffer currStr
, Boolean wasInquote
) {
152 // List<String> tokens = new ArrayList<String>();
154 // System.out.println("#LINE: " + str);
157 currStr
.append('\n');
159 char[] arr
= str
.toCharArray();
160 boolean inQuote
= wasInquote
;
161 // StringBuffer currStr = new StringBuffer("");
162 for (int i
= 0; i
< arr
.length
; i
++) {
164 if (c
== separator
) {
166 tokens
.add(currStr
.toString());
167 // System.out.println("# TOKEN: " + currStr);
168 currStr
.delete(0, currStr
.length());
170 // we don't remove separators that are in a quoted substring
172 // .println("IN QUOTE, got a separator: [" + c + "]");
175 } else if (c
== quote
) {
176 if (inQuote
&& (i
+ 1) < arr
.length
&& arr
[i
+ 1] == quote
) {
177 // case of double quote
178 currStr
.append(quote
);
181 inQuote
= inQuote ?
false : true;
189 tokens
.add(currStr
.toString());
190 // System.out.println("# TOKEN: " + currStr);
193 // throw new ArgeoException("Missing quote at the end of the line "
194 // + str + " (parsed: " + tokens + ")");
202 public char getSeparator() {
206 public synchronized void setSeparator(char separator
) {
207 this.separator
= separator
;
210 public char getQuote() {
214 public synchronized void setQuote(char quote
) {
218 public Boolean
getNoHeader() {
222 public synchronized void setNoHeader(Boolean noHeader
) {
223 this.noHeader
= noHeader
;
226 public Boolean
getStrictLineAsLongAsHeader() {
227 return strictLineAsLongAsHeader
;
230 public synchronized void setStrictLineAsLongAsHeader(
231 Boolean strictLineAsLongAsHeader
) {
232 this.strictLineAsLongAsHeader
= strictLineAsLongAsHeader
;