]> git.argeo.org Git - lgpl/argeo-commons.git/blob - basic/runtime/org.argeo.basic.nodeps/src/main/java/org/argeo/util/CsvParser.java
Deal with null values
[lgpl/argeo-commons.git] / basic / runtime / org.argeo.basic.nodeps / src / main / java / org / argeo / util / CsvParser.java
1 package org.argeo.util;
2
3 import java.io.BufferedReader;
4 import java.io.IOException;
5 import java.io.InputStream;
6 import java.io.InputStreamReader;
7 import java.util.ArrayList;
8 import java.util.Collections;
9 import java.util.List;
10
11 import org.argeo.ArgeoException;
12
13 /**
14 * Parses a CSV file interpreting the first line as a header. The
15 * {@link #parse(InputStream)} method and the setters are synchronized so that
16 * the object cannot be modified when parsing.
17 */
18 public abstract class CsvParser {
19 private char separator = ',';
20 private char quote = '\"';
21
22 private Boolean noHeader = false;
23 private Boolean strictLineAsLongAsHeader = true;
24
25 /**
26 * Actually process a parsed line. If
27 * {@link #setStrictLineAsLongAsHeader(Boolean)} is true (default) the
28 * header and the tokens are guaranteed to have the same size.
29 *
30 * @param lineNumber
31 * the current line number, starts at 1 (the header, if header
32 * processing is enabled, the first line otherwise)
33 * @param header
34 * the read-only header or null if {@link #setNoHeader(Boolean)}
35 * is true (default is false)
36 * @param tokens
37 * the parsed tokens
38 */
39 protected abstract void processLine(Integer lineNumber,
40 List<String> header, List<String> tokens);
41
42 public synchronized void parse(InputStream in) {
43 parse(in, null);
44 }
45
46 public synchronized void parse(InputStream in, String encoding) {
47 BufferedReader reader = null;
48 Integer lineCount = 0;
49 try {
50 if (encoding == null)
51 reader = new BufferedReader(new InputStreamReader(in));
52 else
53 reader = new BufferedReader(new InputStreamReader(in, encoding));
54 List<String> header = null;
55 if (!noHeader) {
56 String headerStr = reader.readLine();
57 if (headerStr == null)// empty file
58 return;
59 lineCount++;
60 header = new ArrayList<String>();
61 StringBuffer currStr = new StringBuffer("");
62 Boolean wasInquote = false;
63 while (parseLine(headerStr, header, currStr, wasInquote)) {
64 headerStr = reader.readLine();
65 if (headerStr == null)
66 break;
67 wasInquote = true;
68 }
69 header = Collections.unmodifiableList(header);
70 }
71
72 String line = null;
73 lines: while ((line = reader.readLine()) != null) {
74 line = preProcessLine(line);
75 if (line == null) {
76 // skip line
77 continue lines;
78 }
79 lineCount++;
80 List<String> tokens = new ArrayList<String>();
81 StringBuffer currStr = new StringBuffer("");
82 Boolean wasInquote = false;
83 sublines: while (parseLine(line, tokens, currStr, wasInquote)) {
84 line = reader.readLine();
85 if (line == null)
86 break sublines;
87 wasInquote = true;
88 }
89 if (!noHeader && strictLineAsLongAsHeader) {
90 int headerSize = header.size();
91 int tokenSize = tokens.size();
92 if (tokenSize == 1 && line.trim().equals(""))
93 continue lines;// empty line
94 if (headerSize != tokenSize) {
95 throw new ArgeoException("Token size " + tokenSize
96 + " is different from header size "
97 + headerSize + " at line " + lineCount
98 + ", line: " + line + ", header: " + header
99 + ", tokens: " + tokens);
100 }
101 }
102 processLine(lineCount, header, tokens);
103 }
104 } catch (ArgeoException e) {
105 throw e;
106 } catch (IOException e) {
107 throw new ArgeoException("Cannot parse CSV file (line: "
108 + lineCount + ")", e);
109 } finally {
110 if (reader != null)
111 try {
112 reader.close();
113 } catch (Exception e2) {
114 // silent
115 }
116 }
117 }
118
119 /**
120 * Called before each (logical) line is processed, giving a change to modify
121 * it (typically for cleaning dirty files). To be overridden, return the
122 * line unchanged by default. Skip the line if 'null' is returned.
123 */
124 protected String preProcessLine(String line) {
125 return line;
126 }
127
128 /**
129 * Parses a line character by character for performance purpose
130 *
131 * @return whether to continue parsing this line
132 */
133 protected Boolean parseLine(String str, List<String> tokens,
134 StringBuffer currStr, Boolean wasInquote) {
135 // List<String> tokens = new ArrayList<String>();
136
137 // System.out.println("#LINE: " + str);
138
139 if (wasInquote)
140 currStr.append('\n');
141
142 char[] arr = str.toCharArray();
143 boolean inQuote = wasInquote;
144 // StringBuffer currStr = new StringBuffer("");
145 for (int i = 0; i < arr.length; i++) {
146 char c = arr[i];
147 if (c == separator) {
148 if (!inQuote) {
149 tokens.add(currStr.toString());
150 // System.out.println("# TOKEN: " + currStr);
151 currStr.delete(0, currStr.length());
152 } else {
153 // we don't remove separator that are in a quoted substring
154 // System.out
155 // .println("IN QUOTE, got a separator: [" + c + "]");
156 currStr.append(c);
157 }
158 } else if (c == quote) {
159 if (inQuote && (i + 1) < arr.length && arr[i + 1] == quote) {
160 // case of double quote
161 currStr.append(quote);
162 i++;
163 } else {// standard
164 inQuote = inQuote ? false : true;
165 }
166 } else {
167 currStr.append(c);
168 }
169 }
170
171 if (!inQuote) {
172 tokens.add(currStr.toString());
173 // System.out.println("# TOKEN: " + currStr);
174 }
175 // if (inQuote)
176 // throw new ArgeoException("Missing quote at the end of the line "
177 // + str + " (parsed: " + tokens + ")");
178 if (inQuote)
179 return true;
180 else
181 return false;
182 // return tokens;
183 }
184
185 public char getSeparator() {
186 return separator;
187 }
188
189 public synchronized void setSeparator(char separator) {
190 this.separator = separator;
191 }
192
193 public char getQuote() {
194 return quote;
195 }
196
197 public synchronized void setQuote(char quote) {
198 this.quote = quote;
199 }
200
201 public Boolean getNoHeader() {
202 return noHeader;
203 }
204
205 public synchronized void setNoHeader(Boolean noHeader) {
206 this.noHeader = noHeader;
207 }
208
209 public Boolean getStrictLineAsLongAsHeader() {
210 return strictLineAsLongAsHeader;
211 }
212
213 public synchronized void setStrictLineAsLongAsHeader(
214 Boolean strictLineAsLongAsHeader) {
215 this.strictLineAsLongAsHeader = strictLineAsLongAsHeader;
216 }
217
218 }