1 package org
.argeo
.util
;
3 import java
.io
.BufferedReader
;
4 import java
.io
.IOException
;
5 import java
.io
.InputStream
;
6 import java
.io
.InputStreamReader
;
7 import java
.util
.ArrayList
;
8 import java
.util
.Collections
;
12 * Parses a CSV file interpreting the first line as a header. The
13 * {@link #parse(InputStream)} method and the setters are synchronized so that
14 * the object cannot be modified when parsing.
16 public abstract class CsvParser
{
17 private char separator
= ',';
18 private char quote
= '\"';
20 private Boolean noHeader
= false;
21 private Boolean strictLineAsLongAsHeader
= true;
24 * Actually process a parsed line. If
25 * {@link #setStrictLineAsLongAsHeader(Boolean)} is true (default) the
26 * header and the tokens are guaranteed to have the same size.
29 * the current line number, starts at 1 (the header, if header
30 * processing is enabled, the first line otherwise)
32 * the read-only header or null if {@link #setNoHeader(Boolean)}
33 * is true (default is false)
37 protected abstract void processLine(Integer lineNumber
,
38 List
<String
> header
, List
<String
> tokens
);
41 * Parses the CSV file (stream is closed at the end)
43 public synchronized void parse(InputStream in
) {
48 * Parses the CSV file (stream is closed at the end)
50 public synchronized void parse(InputStream in
, String encoding
) {
51 BufferedReader reader
= null;
52 Integer lineCount
= 0;
55 reader
= new BufferedReader(new InputStreamReader(in
));
57 reader
= new BufferedReader(new InputStreamReader(in
, encoding
));
58 List
<String
> header
= null;
60 String headerStr
= reader
.readLine();
61 if (headerStr
== null)// empty file
64 header
= new ArrayList
<String
>();
65 StringBuffer currStr
= new StringBuffer("");
66 Boolean wasInquote
= false;
67 while (parseLine(headerStr
, header
, currStr
, wasInquote
)) {
68 headerStr
= reader
.readLine();
69 if (headerStr
== null)
73 header
= Collections
.unmodifiableList(header
);
77 lines
: while ((line
= reader
.readLine()) != null) {
78 line
= preProcessLine(line
);
84 List
<String
> tokens
= new ArrayList
<String
>();
85 StringBuffer currStr
= new StringBuffer("");
86 Boolean wasInquote
= false;
87 sublines
: while (parseLine(line
, tokens
, currStr
, wasInquote
)) {
88 line
= reader
.readLine();
93 if (!noHeader
&& strictLineAsLongAsHeader
) {
94 int headerSize
= header
.size();
95 int tokenSize
= tokens
.size();
96 if (tokenSize
== 1 && line
.trim().equals(""))
97 continue lines
;// empty line
98 if (headerSize
!= tokenSize
) {
99 throw new UtilsException("Token size " + tokenSize
100 + " is different from header size "
101 + headerSize
+ " at line " + lineCount
102 + ", line: " + line
+ ", header: " + header
103 + ", tokens: " + tokens
);
106 processLine(lineCount
, header
, tokens
);
108 } catch (UtilsException e
) {
110 } catch (IOException e
) {
111 throw new UtilsException("Cannot parse CSV file (line: "
112 + lineCount
+ ")", e
);
114 StreamUtils
.closeQuietly(reader
);
119 * Called before each (logical) line is processed, giving a chance to modify
120 * it (typically for cleaning dirty files). To be overridden; returns the
121 * line unchanged by default. The line is skipped if 'null' is returned.
123 protected String
preProcessLine(String line
) {
128 * Parses a line character by character for performance purposes
130 * @return whether to continue parsing this line
132 protected Boolean
parseLine(String str
, List
<String
> tokens
,
133 StringBuffer currStr
, Boolean wasInquote
) {
134 // List<String> tokens = new ArrayList<String>();
136 // System.out.println("#LINE: " + str);
139 currStr
.append('\n');
141 char[] arr
= str
.toCharArray();
142 boolean inQuote
= wasInquote
;
143 // StringBuffer currStr = new StringBuffer("");
144 for (int i
= 0; i
< arr
.length
; i
++) {
146 if (c
== separator
) {
148 tokens
.add(currStr
.toString());
149 // System.out.println("# TOKEN: " + currStr);
150 currStr
.delete(0, currStr
.length());
152 // we don't remove separators that are in a quoted substring
154 // .println("IN QUOTE, got a separator: [" + c + "]");
157 } else if (c
== quote
) {
158 if (inQuote
&& (i
+ 1) < arr
.length
&& arr
[i
+ 1] == quote
) {
159 // case of double quote
160 currStr
.append(quote
);
163 inQuote
= inQuote ?
false : true;
171 tokens
.add(currStr
.toString());
172 // System.out.println("# TOKEN: " + currStr);
175 // throw new ArgeoException("Missing quote at the end of the line "
176 // + str + " (parsed: " + tokens + ")");
184 public char getSeparator() {
188 public synchronized void setSeparator(char separator
) {
189 this.separator
= separator
;
192 public char getQuote() {
196 public synchronized void setQuote(char quote
) {
200 public Boolean
getNoHeader() {
204 public synchronized void setNoHeader(Boolean noHeader
) {
205 this.noHeader
= noHeader
;
208 public Boolean
getStrictLineAsLongAsHeader() {
209 return strictLineAsLongAsHeader
;
212 public synchronized void setStrictLineAsLongAsHeader(
213 Boolean strictLineAsLongAsHeader
) {
214 this.strictLineAsLongAsHeader
= strictLineAsLongAsHeader
;