// git.argeo.org Git - lgpl/argeo-commons.git/blob - CsvParser.java
// blob 3a429f4adc23a634dfcff402203a20e2c83e04d0
// [lgpl/argeo-commons.git] / CsvParser.java
1 package org.argeo.util;
2
3 import java.io.BufferedReader;
4 import java.io.IOException;
5 import java.io.InputStream;
6 import java.io.InputStreamReader;
7 import java.util.ArrayList;
8 import java.util.Collections;
9 import java.util.List;
10
11 import org.argeo.ArgeoException;
12
13 /**
14 * Parses a CSV file interpreting the first line as a header. The
15 * {@link #parse(InputStream)} method and the setters are synchronized so that
16 * the object cannot be modified when parsing.
17 */
18 public abstract class CsvParser {
19 private char separator = ',';
20 private char quote = '\"';
21
22 private Boolean noHeader = false;
23 private Boolean strictLineAsLongAsHeader = true;
24
25 /**
26 * Actually process a parsed line. If
27 * {@link #setStrictLineAsLongAsHeader(Boolean)} is true (default) the
28 * header and the tokens are guaranteed to have the same size.
29 *
30 * @param lineNumber
31 * the current line number, starts at 1 (the header, if header
32 * processing is enabled, the first line otherwise)
33 * @param header
34 * the read-only header or null if {@link #setNoHeader(Boolean)}
35 * is true (default is false)
36 * @param tokens
37 * the parsed tokens
38 */
39 protected abstract void processLine(Integer lineNumber,
40 List<String> header, List<String> tokens);
41
42 public synchronized void parse(InputStream in) {
43 parse(in, null);
44 }
45
46 public synchronized void parse(InputStream in, String encoding) {
47 BufferedReader reader = null;
48 Integer lineCount = 0;
49 try {
50 if (encoding == null)
51 reader = new BufferedReader(new InputStreamReader(in));
52 else
53 reader = new BufferedReader(new InputStreamReader(in, encoding));
54 List<String> header = null;
55 if (!noHeader) {
56 String headerStr = reader.readLine();
57 if (headerStr == null)// empty file
58 return;
59 lineCount++;
60 header = new ArrayList<String>();
61 StringBuffer currStr = new StringBuffer("");
62 Boolean wasInquote = false;
63 while (parseLine(headerStr, header, currStr, wasInquote)) {
64 headerStr = reader.readLine();
65 if (headerStr == null)
66 break;
67 wasInquote = true;
68 }
69 header = Collections.unmodifiableList(header);
70 }
71
72 String line = null;
73 lines: while ((line = reader.readLine()) != null) {
74 lineCount++;
75 List<String> tokens = new ArrayList<String>();
76 StringBuffer currStr = new StringBuffer("");
77 Boolean wasInquote = false;
78 while (parseLine(line, tokens, currStr, wasInquote)) {
79 line = reader.readLine();
80 if (line == null)
81 break;
82 wasInquote = true;
83 }
84 if (!noHeader && strictLineAsLongAsHeader) {
85 int headerSize = header.size();
86 int tokenSize = tokens.size();
87 if (tokenSize == 1 && line.trim().equals(""))
88 continue lines;// empty line
89 if (headerSize != tokenSize) {
90 throw new ArgeoException("Token size " + tokenSize
91 + " is different from header size "
92 + headerSize + " at line " + lineCount
93 + ", line: " + line + ", header: " + header
94 + ", tokens: " + tokens);
95 }
96 }
97 processLine(lineCount, header, tokens);
98 }
99 } catch (ArgeoException e) {
100 throw e;
101 } catch (IOException e) {
102 throw new ArgeoException("Cannot parse CSV file (line: "
103 + lineCount + ")", e);
104 } finally {
105 if (reader != null)
106 try {
107 reader.close();
108 } catch (Exception e2) {
109 // silent
110 }
111 }
112 }
113
114 /**
115 * Parses a line character by character for performance purpose
116 *
117 * @return whether to continue parsing this line
118 */
119 protected Boolean parseLine(String str, List<String> tokens,
120 StringBuffer currStr, Boolean wasInquote) {
121 // List<String> tokens = new ArrayList<String>();
122
123 // System.out.println("#LINE: " + str);
124
125 if (wasInquote)
126 currStr.append('\n');
127
128 char[] arr = str.toCharArray();
129 boolean inQuote = wasInquote;
130 // StringBuffer currStr = new StringBuffer("");
131 for (int i = 0; i < arr.length; i++) {
132 char c = arr[i];
133 if (c == separator) {
134 if (!inQuote) {
135 tokens.add(currStr.toString());
136 // System.out.println("# TOKEN: " + currStr);
137 currStr.delete(0, currStr.length());
138 } else {
139 // we don't remove separator that are in a quoted substring
140 // System.out
141 // .println("IN QUOTE, got a separator: [" + c + "]");
142 currStr.append(c);
143 }
144 } else if (c == quote) {
145 if (inQuote && (i + 1) < arr.length && arr[i + 1] == quote) {
146 // case of double quote
147 currStr.append(quote);
148 i++;
149 } else {// standard
150 inQuote = inQuote ? false : true;
151 }
152 } else {
153 currStr.append(c);
154 }
155 }
156
157 if (!inQuote) {
158 tokens.add(currStr.toString());
159 // System.out.println("# TOKEN: " + currStr);
160 }
161 // if (inQuote)
162 // throw new ArgeoException("Missing quote at the end of the line "
163 // + str + " (parsed: " + tokens + ")");
164 if (inQuote)
165 return true;
166 else
167 return false;
168 // return tokens;
169 }
170
171 public char getSeparator() {
172 return separator;
173 }
174
175 public synchronized void setSeparator(char separator) {
176 this.separator = separator;
177 }
178
179 public char getQuote() {
180 return quote;
181 }
182
183 public synchronized void setQuote(char quote) {
184 this.quote = quote;
185 }
186
187 public Boolean getNoHeader() {
188 return noHeader;
189 }
190
191 public synchronized void setNoHeader(Boolean noHeader) {
192 this.noHeader = noHeader;
193 }
194
195 public Boolean getStrictLineAsLongAsHeader() {
196 return strictLineAsLongAsHeader;
197 }
198
199 public synchronized void setStrictLineAsLongAsHeader(
200 Boolean strictLineAsLongAsHeader) {
201 this.strictLineAsLongAsHeader = strictLineAsLongAsHeader;
202 }
203
204 }