]> git.argeo.org Git - lgpl/argeo-commons.git/blob - org.argeo.util/src/org/argeo/util/CsvParser.java
[maven-release-plugin] prepare release argeo-commons-2.1.44
[lgpl/argeo-commons.git] / org.argeo.util / src / org / argeo / util / CsvParser.java
1 /*
2 * Copyright (C) 2007-2012 Argeo GmbH
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.argeo.util;
17
18 import java.io.BufferedReader;
19 import java.io.IOException;
20 import java.io.InputStream;
21 import java.io.InputStreamReader;
22 import java.util.ArrayList;
23 import java.util.Collections;
24 import java.util.List;
25
26 import org.argeo.ArgeoException;
27 import org.argeo.StreamUtils;
28
29 /**
30 * Parses a CSV file interpreting the first line as a header. The
31 * {@link #parse(InputStream)} method and the setters are synchronized so that
32 * the object cannot be modified when parsing.
33 */
34 public abstract class CsvParser {
35 private char separator = ',';
36 private char quote = '\"';
37
38 private Boolean noHeader = false;
39 private Boolean strictLineAsLongAsHeader = true;
40
41 /**
42 * Actually process a parsed line. If
43 * {@link #setStrictLineAsLongAsHeader(Boolean)} is true (default) the
44 * header and the tokens are guaranteed to have the same size.
45 *
46 * @param lineNumber
47 * the current line number, starts at 1 (the header, if header
48 * processing is enabled, the first line otherwise)
49 * @param header
50 * the read-only header or null if {@link #setNoHeader(Boolean)}
51 * is true (default is false)
52 * @param tokens
53 * the parsed tokens
54 */
55 protected abstract void processLine(Integer lineNumber,
56 List<String> header, List<String> tokens);
57
58 /**
59 * Parses the CSV file (stream is closed at the end)
60 */
61 public synchronized void parse(InputStream in) {
62 parse(in, null);
63 }
64
65 /**
66 * Parses the CSV file (stream is closed at the end)
67 */
68 public synchronized void parse(InputStream in, String encoding) {
69 BufferedReader reader = null;
70 Integer lineCount = 0;
71 try {
72 if (encoding == null)
73 reader = new BufferedReader(new InputStreamReader(in));
74 else
75 reader = new BufferedReader(new InputStreamReader(in, encoding));
76 List<String> header = null;
77 if (!noHeader) {
78 String headerStr = reader.readLine();
79 if (headerStr == null)// empty file
80 return;
81 lineCount++;
82 header = new ArrayList<String>();
83 StringBuffer currStr = new StringBuffer("");
84 Boolean wasInquote = false;
85 while (parseLine(headerStr, header, currStr, wasInquote)) {
86 headerStr = reader.readLine();
87 if (headerStr == null)
88 break;
89 wasInquote = true;
90 }
91 header = Collections.unmodifiableList(header);
92 }
93
94 String line = null;
95 lines: while ((line = reader.readLine()) != null) {
96 line = preProcessLine(line);
97 if (line == null) {
98 // skip line
99 continue lines;
100 }
101 lineCount++;
102 List<String> tokens = new ArrayList<String>();
103 StringBuffer currStr = new StringBuffer("");
104 Boolean wasInquote = false;
105 sublines: while (parseLine(line, tokens, currStr, wasInquote)) {
106 line = reader.readLine();
107 if (line == null)
108 break sublines;
109 wasInquote = true;
110 }
111 if (!noHeader && strictLineAsLongAsHeader) {
112 int headerSize = header.size();
113 int tokenSize = tokens.size();
114 if (tokenSize == 1 && line.trim().equals(""))
115 continue lines;// empty line
116 if (headerSize != tokenSize) {
117 throw new ArgeoException("Token size " + tokenSize
118 + " is different from header size "
119 + headerSize + " at line " + lineCount
120 + ", line: " + line + ", header: " + header
121 + ", tokens: " + tokens);
122 }
123 }
124 processLine(lineCount, header, tokens);
125 }
126 } catch (ArgeoException e) {
127 throw e;
128 } catch (IOException e) {
129 throw new ArgeoException("Cannot parse CSV file (line: "
130 + lineCount + ")", e);
131 } finally {
132 StreamUtils.closeQuietly(reader);
133 }
134 }
135
136 /**
137 * Called before each (logical) line is processed, giving a change to modify
138 * it (typically for cleaning dirty files). To be overridden, return the
139 * line unchanged by default. Skip the line if 'null' is returned.
140 */
141 protected String preProcessLine(String line) {
142 return line;
143 }
144
145 /**
146 * Parses a line character by character for performance purpose
147 *
148 * @return whether to continue parsing this line
149 */
150 protected Boolean parseLine(String str, List<String> tokens,
151 StringBuffer currStr, Boolean wasInquote) {
152 // List<String> tokens = new ArrayList<String>();
153
154 // System.out.println("#LINE: " + str);
155
156 if (wasInquote)
157 currStr.append('\n');
158
159 char[] arr = str.toCharArray();
160 boolean inQuote = wasInquote;
161 // StringBuffer currStr = new StringBuffer("");
162 for (int i = 0; i < arr.length; i++) {
163 char c = arr[i];
164 if (c == separator) {
165 if (!inQuote) {
166 tokens.add(currStr.toString());
167 // System.out.println("# TOKEN: " + currStr);
168 currStr.delete(0, currStr.length());
169 } else {
170 // we don't remove separator that are in a quoted substring
171 // System.out
172 // .println("IN QUOTE, got a separator: [" + c + "]");
173 currStr.append(c);
174 }
175 } else if (c == quote) {
176 if (inQuote && (i + 1) < arr.length && arr[i + 1] == quote) {
177 // case of double quote
178 currStr.append(quote);
179 i++;
180 } else {// standard
181 inQuote = inQuote ? false : true;
182 }
183 } else {
184 currStr.append(c);
185 }
186 }
187
188 if (!inQuote) {
189 tokens.add(currStr.toString());
190 // System.out.println("# TOKEN: " + currStr);
191 }
192 // if (inQuote)
193 // throw new ArgeoException("Missing quote at the end of the line "
194 // + str + " (parsed: " + tokens + ")");
195 if (inQuote)
196 return true;
197 else
198 return false;
199 // return tokens;
200 }
201
202 public char getSeparator() {
203 return separator;
204 }
205
206 public synchronized void setSeparator(char separator) {
207 this.separator = separator;
208 }
209
210 public char getQuote() {
211 return quote;
212 }
213
214 public synchronized void setQuote(char quote) {
215 this.quote = quote;
216 }
217
218 public Boolean getNoHeader() {
219 return noHeader;
220 }
221
222 public synchronized void setNoHeader(Boolean noHeader) {
223 this.noHeader = noHeader;
224 }
225
226 public Boolean getStrictLineAsLongAsHeader() {
227 return strictLineAsLongAsHeader;
228 }
229
230 public synchronized void setStrictLineAsLongAsHeader(
231 Boolean strictLineAsLongAsHeader) {
232 this.strictLineAsLongAsHeader = strictLineAsLongAsHeader;
233 }
234
235 }