1 package org
.argeo
.util
;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
 * Parses a CSV file interpreting the first line as a header. The
 * {@link #parse(InputStream)} method and the setters are synchronized so that
 * the object cannot be modified when parsing.
 * <p>
 * A field may be enclosed in the quote character; inside a quoted field the
 * separator and line breaks are kept verbatim and a doubled quote stands for a
 * single quote character. Subclasses receive each parsed record through
 * {@link #processLine(Integer, List, List)}.
 */
public abstract class CsvParser {
    private char separator = ',';
    private char quote = '\"';

    private Boolean noHeader = false;
    private Boolean strictLineAsLongAsHeader = true;

    /**
     * Actually process a parsed line. If
     * {@link #setStrictLineAsLongAsHeader(Boolean)} is true (default) the header
     * and the tokens are guaranteed to have the same size.
     *
     * @param lineNumber the current line number, starts at 1 (the header, if header
     *                   processing is enabled, the first line otherwise)
     * @param header     the read-only header or null if
     *                   {@link #setNoHeader(Boolean)} is true (default is false)
     * @param tokens     the parsed tokens
     */
    protected abstract void processLine(Integer lineNumber, List<String> header, List<String> tokens);

    /**
     * Parses the CSV file (stream is closed at the end) using the platform default
     * charset.
     *
     * @param in the stream to parse
     *
     * @deprecated Use {@link #parse(InputStream, Charset)} instead.
     */
    @Deprecated
    public synchronized void parse(InputStream in) {
        parse(in, (Charset) null);
    }

    /**
     * Parses the CSV file (stream is closed at the end)
     *
     * @param in       the stream to parse
     * @param encoding the encoding to use, or null for the platform default
     * @throws IllegalArgumentException if the named encoding is not supported
     *
     * @deprecated Use {@link #parse(InputStream, Charset)} instead.
     */
    @Deprecated
    public synchronized void parse(InputStream in, String encoding) {
        Reader reader;
        if (encoding == null) {
            reader = new InputStreamReader(in);
        } else {
            try {
                reader = new InputStreamReader(in, encoding);
            } catch (UnsupportedEncodingException e) {
                throw new IllegalArgumentException(e);
            }
        }
        parse(reader);
    }

    /**
     * Parses the CSV file (stream is closed at the end)
     *
     * @param in      the stream to parse
     * @param charset the charset to use, or null for the platform default
     */
    public synchronized void parse(InputStream in, Charset charset) {
        Reader reader;
        if (charset == null) {
            reader = new InputStreamReader(in);
        } else {
            reader = new InputStreamReader(in, charset);
        }
        parse(reader);
    }

    /**
     * Parses the CSV file (stream is closed at the end)
     * <p>
     * If the stream ends inside a quoted field that was never closed, the text
     * read so far is kept as the last token of that record instead of being
     * dropped.
     *
     * @param reader the reader to use (it will be buffered)
     * @throws IllegalStateException if strict mode is on and a record does not
     *                               have as many tokens as the header
     * @throws RuntimeException      if the underlying reader fails
     */
    public synchronized void parse(Reader reader) {
        // declared outside the try block so that the catch clause can report it
        int lineCount = 0;
        try (BufferedReader bufferedReader = new BufferedReader(reader)) {
            List<String> header = null;
            if (!noHeader) {
                String headerStr = bufferedReader.readLine();
                if (headerStr == null) {// empty file
                    return;
                }
                lineCount++;
                List<String> headerTokens = new ArrayList<>();
                readRecord(bufferedReader, headerStr, headerTokens);
                header = Collections.unmodifiableList(headerTokens);
            }

            String line;
            while ((line = bufferedReader.readLine()) != null) {
                line = preProcessLine(line);
                if (line == null) {
                    // the subclass asked to skip this line
                    continue;
                }
                lineCount++;
                List<String> tokens = new ArrayList<>();
                // last physical line of the record, null if the stream ended
                // inside a quoted field
                String lastLine = readRecord(bufferedReader, line, tokens);

                if (!noHeader && strictLineAsLongAsHeader) {
                    int headerSize = header.size();
                    int tokenSize = tokens.size();
                    if (tokenSize == 1 && lastLine != null && lastLine.trim().isEmpty()) {
                        continue;// empty line
                    }
                    if (headerSize != tokenSize) {
                        throw new IllegalStateException("Token size " + tokenSize + " is different from header size "
                                + headerSize + " at line " + lineCount + ", line: " + lastLine + ", header: " + header
                                + ", tokens: " + tokens);
                    }
                }
                processLine(lineCount, header, tokens);
            }
        } catch (IOException e) {
            throw new RuntimeException("Cannot parse CSV file (line: " + lineCount + ")", e);
        }
    }

    /**
     * Reads one logical record, pulling further physical lines from the reader
     * for as long as a quoted field stays open.
     *
     * @param in        where continuation lines are read from
     * @param firstLine the first physical line of the record
     * @param tokens    receives the parsed tokens
     * @return the last physical line that was parsed, or null if the stream ended
     *         while a quoted field was still open
     */
    private String readRecord(BufferedReader in, String firstLine, List<String> tokens) throws IOException {
        StringBuffer pending = new StringBuffer();
        String line = firstLine;
        boolean insideQuote = false;
        while (parseLine(line, tokens, pending, insideQuote)) {
            line = in.readLine();
            if (line == null) {
                // unterminated quote at end of stream: keep what was read
                tokens.add(pending.toString());
                return null;
            }
            insideQuote = true;
        }
        return line;
    }

    /**
     * Called before each (logical) line is processed, giving a chance to modify it
     * (typically for cleaning dirty files). To be overridden, return the line
     * unchanged by default. Skip the line if 'null' is returned.
     *
     * @param line the line as read from the stream
     * @return the line to parse, or null to skip it
     */
    protected String preProcessLine(String line) {
        return line;
    }

    /**
     * Parses a line character by character for performance purpose
     *
     * @param str        the physical line to parse
     * @param tokens     receives each completed token
     * @param currStr    accumulates the token being read; when a quoted field
     *                   spans several lines it carries the text over to the next
     *                   call
     * @param wasInquote whether the previous physical line ended inside a quoted
     *                   field, in which case a line break is inserted first
     * @return whether to continue parsing this line, that is whether a quoted
     *         field is still open at the end of {@code str}
     */
    protected Boolean parseLine(String str, List<String> tokens, StringBuffer currStr, Boolean wasInquote) {
        if (wasInquote) {
            // restore the line break that readLine() removed inside the quoted field
            currStr.append('\n');
        }

        boolean inQuote = wasInquote;
        final int length = str.length();
        for (int i = 0; i < length; i++) {
            char c = str.charAt(i);
            if (c == separator) {
                if (inQuote) {
                    // we don't remove separator that are in a quoted substring
                    currStr.append(c);
                } else {
                    tokens.add(currStr.toString());
                    currStr.setLength(0);
                }
            } else if (c == quote) {
                if (inQuote && (i + 1) < length && str.charAt(i + 1) == quote) {
                    // case of double quote: emit one quote and skip its twin
                    currStr.append(quote);
                    i++;
                } else {
                    // opening or closing quote
                    inQuote = !inQuote;
                }
            } else {
                currStr.append(c);
            }
        }

        if (inQuote) {
            // the quoted field continues on the next physical line
            return true;
        }
        tokens.add(currStr.toString());
        return false;
    }

    /** @return the character separating two fields (comma by default) */
    public char getSeparator() {
        return separator;
    }

    /** @param separator the character separating two fields */
    public synchronized void setSeparator(char separator) {
        this.separator = separator;
    }

    /** @return the character enclosing quoted fields (double quote by default) */
    public char getQuote() {
        return quote;
    }

    /** @param quote the character enclosing quoted fields */
    public synchronized void setQuote(char quote) {
        this.quote = quote;
    }

    /** @return true if the first line is treated as data rather than as a header */
    public Boolean getNoHeader() {
        return noHeader;
    }

    /** @param noHeader true to treat the first line as data rather than as a header */
    public synchronized void setNoHeader(Boolean noHeader) {
        this.noHeader = noHeader;
    }

    /** @return true if each record must have as many tokens as the header */
    public Boolean getStrictLineAsLongAsHeader() {
        return strictLineAsLongAsHeader;
    }

    /**
     * @param strictLineAsLongAsHeader true to reject records whose token count
     *                                 differs from the header's
     */
    public synchronized void setStrictLineAsLongAsHeader(Boolean strictLineAsLongAsHeader) {
        this.strictLineAsLongAsHeader = strictLineAsLongAsHeader;
    }
}