]> git.argeo.org Git - lgpl/argeo-commons.git/blob - CsvParser.java
127d0f50928f9b358c5f376a9ba8704f9d9ba697
[lgpl/argeo-commons.git] / CsvParser.java
1 /*
2 * Copyright (C) 2007-2012 Mathieu Baudier
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.argeo.util;
17
18 import java.io.BufferedReader;
19 import java.io.IOException;
20 import java.io.InputStream;
21 import java.io.InputStreamReader;
22 import java.util.ArrayList;
23 import java.util.Collections;
24 import java.util.List;
25
26 import org.argeo.ArgeoException;
27
28 /**
29 * Parses a CSV file interpreting the first line as a header. The
30 * {@link #parse(InputStream)} method and the setters are synchronized so that
31 * the object cannot be modified when parsing.
32 */
33 public abstract class CsvParser {
34 private char separator = ',';
35 private char quote = '\"';
36
37 private Boolean noHeader = false;
38 private Boolean strictLineAsLongAsHeader = true;
39
40 /**
41 * Actually process a parsed line. If
42 * {@link #setStrictLineAsLongAsHeader(Boolean)} is true (default) the
43 * header and the tokens are guaranteed to have the same size.
44 *
45 * @param lineNumber
46 * the current line number, starts at 1 (the header, if header
47 * processing is enabled, the first line otherwise)
48 * @param header
49 * the read-only header or null if {@link #setNoHeader(Boolean)}
50 * is true (default is false)
51 * @param tokens
52 * the parsed tokens
53 */
54 protected abstract void processLine(Integer lineNumber,
55 List<String> header, List<String> tokens);
56
57 public synchronized void parse(InputStream in) {
58 parse(in, null);
59 }
60
61 public synchronized void parse(InputStream in, String encoding) {
62 BufferedReader reader = null;
63 Integer lineCount = 0;
64 try {
65 if (encoding == null)
66 reader = new BufferedReader(new InputStreamReader(in));
67 else
68 reader = new BufferedReader(new InputStreamReader(in, encoding));
69 List<String> header = null;
70 if (!noHeader) {
71 String headerStr = reader.readLine();
72 if (headerStr == null)// empty file
73 return;
74 lineCount++;
75 header = new ArrayList<String>();
76 StringBuffer currStr = new StringBuffer("");
77 Boolean wasInquote = false;
78 while (parseLine(headerStr, header, currStr, wasInquote)) {
79 headerStr = reader.readLine();
80 if (headerStr == null)
81 break;
82 wasInquote = true;
83 }
84 header = Collections.unmodifiableList(header);
85 }
86
87 String line = null;
88 lines: while ((line = reader.readLine()) != null) {
89 line = preProcessLine(line);
90 if (line == null) {
91 // skip line
92 continue lines;
93 }
94 lineCount++;
95 List<String> tokens = new ArrayList<String>();
96 StringBuffer currStr = new StringBuffer("");
97 Boolean wasInquote = false;
98 sublines: while (parseLine(line, tokens, currStr, wasInquote)) {
99 line = reader.readLine();
100 if (line == null)
101 break sublines;
102 wasInquote = true;
103 }
104 if (!noHeader && strictLineAsLongAsHeader) {
105 int headerSize = header.size();
106 int tokenSize = tokens.size();
107 if (tokenSize == 1 && line.trim().equals(""))
108 continue lines;// empty line
109 if (headerSize != tokenSize) {
110 throw new ArgeoException("Token size " + tokenSize
111 + " is different from header size "
112 + headerSize + " at line " + lineCount
113 + ", line: " + line + ", header: " + header
114 + ", tokens: " + tokens);
115 }
116 }
117 processLine(lineCount, header, tokens);
118 }
119 } catch (ArgeoException e) {
120 throw e;
121 } catch (IOException e) {
122 throw new ArgeoException("Cannot parse CSV file (line: "
123 + lineCount + ")", e);
124 } finally {
125 if (reader != null)
126 try {
127 reader.close();
128 } catch (Exception e2) {
129 // silent
130 }
131 }
132 }
133
134 /**
135 * Called before each (logical) line is processed, giving a change to modify
136 * it (typically for cleaning dirty files). To be overridden, return the
137 * line unchanged by default. Skip the line if 'null' is returned.
138 */
139 protected String preProcessLine(String line) {
140 return line;
141 }
142
143 /**
144 * Parses a line character by character for performance purpose
145 *
146 * @return whether to continue parsing this line
147 */
148 protected Boolean parseLine(String str, List<String> tokens,
149 StringBuffer currStr, Boolean wasInquote) {
150 // List<String> tokens = new ArrayList<String>();
151
152 // System.out.println("#LINE: " + str);
153
154 if (wasInquote)
155 currStr.append('\n');
156
157 char[] arr = str.toCharArray();
158 boolean inQuote = wasInquote;
159 // StringBuffer currStr = new StringBuffer("");
160 for (int i = 0; i < arr.length; i++) {
161 char c = arr[i];
162 if (c == separator) {
163 if (!inQuote) {
164 tokens.add(currStr.toString());
165 // System.out.println("# TOKEN: " + currStr);
166 currStr.delete(0, currStr.length());
167 } else {
168 // we don't remove separator that are in a quoted substring
169 // System.out
170 // .println("IN QUOTE, got a separator: [" + c + "]");
171 currStr.append(c);
172 }
173 } else if (c == quote) {
174 if (inQuote && (i + 1) < arr.length && arr[i + 1] == quote) {
175 // case of double quote
176 currStr.append(quote);
177 i++;
178 } else {// standard
179 inQuote = inQuote ? false : true;
180 }
181 } else {
182 currStr.append(c);
183 }
184 }
185
186 if (!inQuote) {
187 tokens.add(currStr.toString());
188 // System.out.println("# TOKEN: " + currStr);
189 }
190 // if (inQuote)
191 // throw new ArgeoException("Missing quote at the end of the line "
192 // + str + " (parsed: " + tokens + ")");
193 if (inQuote)
194 return true;
195 else
196 return false;
197 // return tokens;
198 }
199
200 public char getSeparator() {
201 return separator;
202 }
203
204 public synchronized void setSeparator(char separator) {
205 this.separator = separator;
206 }
207
208 public char getQuote() {
209 return quote;
210 }
211
212 public synchronized void setQuote(char quote) {
213 this.quote = quote;
214 }
215
216 public Boolean getNoHeader() {
217 return noHeader;
218 }
219
220 public synchronized void setNoHeader(Boolean noHeader) {
221 this.noHeader = noHeader;
222 }
223
224 public Boolean getStrictLineAsLongAsHeader() {
225 return strictLineAsLongAsHeader;
226 }
227
228 public synchronized void setStrictLineAsLongAsHeader(
229 Boolean strictLineAsLongAsHeader) {
230 this.strictLineAsLongAsHeader = strictLineAsLongAsHeader;
231 }
232
233 }