]> git.argeo.org Git - lgpl/argeo-commons.git/blob - org.argeo.util/src/org/argeo/util/CsvParser.java
Improve ACR, introduce migration from JCR.
[lgpl/argeo-commons.git] / org.argeo.util / src / org / argeo / util / CsvParser.java
1 package org.argeo.util;
2
3 import java.io.BufferedReader;
4 import java.io.IOException;
5 import java.io.InputStream;
6 import java.io.InputStreamReader;
7 import java.io.Reader;
8 import java.io.UnsupportedEncodingException;
9 import java.nio.charset.Charset;
10 import java.util.ArrayList;
11 import java.util.Collections;
12 import java.util.List;
13
/**
 * Parses a CSV file interpreting the first line as a header (unless
 * {@link #setNoHeader(Boolean)} is set to true). Quoted fields may contain the
 * separator, doubled quotes (interpreted as a single quote character) and line
 * breaks. The {@code parse} methods and the setters are synchronized so that
 * the object cannot be modified when parsing.
 */
public abstract class CsvParser {
	private char separator = ',';
	private char quote = '\"';

	private Boolean noHeader = false;
	private Boolean strictLineAsLongAsHeader = true;

	/**
	 * Actually process a parsed line. If
	 * {@link #setStrictLineAsLongAsHeader(Boolean)} is true (default) the header
	 * and the tokens are guaranteed to have the same size.
	 *
	 * @param lineNumber the current line number, starts at 1 (the header, if header
	 *                   processing is enabled, the first line otherwise)
	 * @param header     the read-only header or null if
	 *                   {@link #setNoHeader(Boolean)} is true (default is false)
	 * @param tokens     the parsed tokens
	 */
	protected abstract void processLine(Integer lineNumber, List<String> header, List<String> tokens);

	/**
	 * Parses the CSV file (stream is closed at the end), decoding it with the
	 * platform default charset.
	 *
	 * @param in the stream to parse
	 *
	 * @deprecated Use {@link #parse(InputStream, Charset)} instead.
	 */
	@Deprecated
	public synchronized void parse(InputStream in) {
		parse(in, (Charset) null);
	}

	/**
	 * Parses the CSV file (stream is closed at the end)
	 *
	 * @param in       the stream to parse
	 * @param encoding the name of the encoding to use, or null for the platform
	 *                 default
	 * @throws IllegalArgumentException if the named encoding is not supported
	 *
	 * @deprecated Use {@link #parse(InputStream, Charset)} instead.
	 */
	@Deprecated
	public synchronized void parse(InputStream in, String encoding) {
		Reader reader;
		if (encoding == null) {
			reader = new InputStreamReader(in);
		} else {
			try {
				reader = new InputStreamReader(in, encoding);
			} catch (UnsupportedEncodingException e) {
				throw new IllegalArgumentException(e);
			}
		}
		parse(reader);
	}

	/**
	 * Parses the CSV file (stream is closed at the end)
	 *
	 * @param in      the stream to parse
	 * @param charset the charset to use, or null for the platform default
	 */
	public synchronized void parse(InputStream in, Charset charset) {
		Reader reader;
		if (charset == null) {
			reader = new InputStreamReader(in);
		} else {
			reader = new InputStreamReader(in, charset);
		}
		parse(reader);
	}

	/**
	 * Parses the CSV file (reader is closed at the end). Each record is handed to
	 * {@link #processLine(Integer, List, List)}. When a header is used and strict
	 * mode is enabled, blank lines are skipped and a record whose token count
	 * differs from the header size is rejected.
	 * <p>
	 * If the input ends inside a quoted field, the text accumulated so far is kept
	 * as the last token of the record instead of being dropped.
	 *
	 * @param reader the reader to use (it will be buffered)
	 * @throws IllegalStateException if strict mode is enabled and a record does not
	 *                               have as many tokens as the header
	 * @throws RuntimeException      if the underlying reader fails
	 */
	public synchronized void parse(Reader reader) {
		Integer lineCount = 0;
		try (BufferedReader bufferedReader = new BufferedReader(reader)) {
			List<String> header = null;
			if (!noHeader) {
				String headerStr = bufferedReader.readLine();
				if (headerStr == null) {
					// empty file
					return;
				}
				lineCount++;
				List<String> headerTokens = new ArrayList<String>();
				readRecord(bufferedReader, headerStr, headerTokens);
				header = Collections.unmodifiableList(headerTokens);
			}

			String line;
			while ((line = bufferedReader.readLine()) != null) {
				line = preProcessLine(line);
				if (line == null) {
					// skip line
					continue;
				}
				lineCount++;
				// Evaluated on the first physical line, which is never null here; a
				// record spanning several lines starts with a quote and is thus not blank.
				boolean blank = line.trim().equals("");
				List<String> tokens = new ArrayList<String>();
				String lastLine = readRecord(bufferedReader, line, tokens);
				if (!noHeader && strictLineAsLongAsHeader) {
					int headerSize = header.size();
					int tokenSize = tokens.size();
					if (tokenSize == 1 && blank) {
						// empty line
						continue;
					}
					if (headerSize != tokenSize) {
						throw new IllegalStateException("Token size " + tokenSize + " is different from header size "
								+ headerSize + " at line " + lineCount + ", line: " + lastLine + ", header: " + header
								+ ", tokens: " + tokens);
					}
				}
				processLine(lineCount, header, tokens);
			}
		} catch (IOException e) {
			throw new RuntimeException("Cannot parse CSV file (line: " + lineCount + ")", e);
		}
	}

	/**
	 * Reads one logical record, pulling additional physical lines from the reader
	 * as long as a quoted field remains open.
	 *
	 * @param in        the reader to pull continuation lines from
	 * @param firstLine the first physical line of the record (not null)
	 * @param tokens    the list to which parsed tokens are appended
	 * @return the last physical line that was parsed for this record (never null)
	 */
	private String readRecord(BufferedReader in, String firstLine, List<String> tokens) throws IOException {
		StringBuffer currStr = new StringBuffer("");
		String line = firstLine;
		Boolean wasInquote = false;
		while (parseLine(line, tokens, currStr, wasInquote)) {
			String next = in.readLine();
			if (next == null) {
				// End of input inside a quoted field: keep what was read so far
				// rather than silently losing the unterminated token.
				tokens.add(currStr.toString());
				break;
			}
			line = next;
			wasInquote = true;
		}
		return line;
	}

	/**
	 * Called before each (logical) line is processed, giving a chance to modify it
	 * (typically for cleaning dirty files). To be overridden, return the line
	 * unchanged by default. Skip the line if 'null' is returned.
	 */
	protected String preProcessLine(String line) {
		return line;
	}

	/**
	 * Parses a line character by character for performance purpose. Completed
	 * tokens are appended to {@code tokens}; the text of a token still in progress
	 * is left in {@code currStr}.
	 *
	 * @param str        the physical line to parse
	 * @param tokens     the list to which completed tokens are appended
	 * @param currStr    the buffer accumulating the current token, carried over
	 *                   between the physical lines of a multi-line record
	 * @param wasInquote whether the previous physical line ended inside a quoted
	 *                   field, in which case a line feed is appended to the buffer
	 *                   before parsing
	 * @return whether to continue parsing this line, that is whether a quoted field
	 *         is still open at the end of {@code str}
	 */
	protected Boolean parseLine(String str, List<String> tokens, StringBuffer currStr, Boolean wasInquote) {
		if (wasInquote) {
			// restore the line break that readLine() stripped from the quoted field
			currStr.append('\n');
		}

		char[] arr = str.toCharArray();
		boolean inQuote = wasInquote;
		for (int i = 0; i < arr.length; i++) {
			char c = arr[i];
			if (c == separator) {
				if (!inQuote) {
					tokens.add(currStr.toString());
					currStr.setLength(0);
				} else {
					// we don't remove separators that are in a quoted substring
					currStr.append(c);
				}
			} else if (c == quote) {
				if (inQuote && (i + 1) < arr.length && arr[i + 1] == quote) {
					// case of double quote: an escaped quote character
					currStr.append(quote);
					i++;
				} else {
					// standard: opens or closes a quoted substring
					inQuote = !inQuote;
				}
			} else {
				currStr.append(c);
			}
		}

		if (!inQuote) {
			// the end of the line terminates the last token
			tokens.add(currStr.toString());
		}
		return inQuote;
	}

	/** Returns the character separating the fields (default is a comma). */
	public char getSeparator() {
		return separator;
	}

	/** Sets the character separating the fields. */
	public synchronized void setSeparator(char separator) {
		this.separator = separator;
	}

	/** Returns the character quoting the fields (default is a double quote). */
	public char getQuote() {
		return quote;
	}

	/** Sets the character quoting the fields. */
	public synchronized void setQuote(char quote) {
		this.quote = quote;
	}

	/** Returns whether the first line is treated as data rather than as a header. */
	public Boolean getNoHeader() {
		return noHeader;
	}

	/**
	 * Sets whether the first line is treated as data rather than as a header
	 * (default is false).
	 */
	public synchronized void setNoHeader(Boolean noHeader) {
		this.noHeader = noHeader;
	}

	/** Returns whether each record must have as many tokens as the header. */
	public Boolean getStrictLineAsLongAsHeader() {
		return strictLineAsLongAsHeader;
	}

	/**
	 * Sets whether each record must have as many tokens as the header (default is
	 * true).
	 */
	public synchronized void setStrictLineAsLongAsHeader(Boolean strictLineAsLongAsHeader) {
		this.strictLineAsLongAsHeader = strictLineAsLongAsHeader;
	}

}