]> git.argeo.org Git - lgpl/argeo-commons.git/blob - org.argeo.util/src/org/argeo/util/CsvParser.java
Make OSGi boot more robust against badly formatted bundles.
[lgpl/argeo-commons.git] / org.argeo.util / src / org / argeo / util / CsvParser.java
1 /*
2 * Copyright (C) 2007-2012 Argeo GmbH
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.argeo.util;
17
18 import java.io.BufferedReader;
19 import java.io.IOException;
20 import java.io.InputStream;
21 import java.io.InputStreamReader;
22 import java.util.ArrayList;
23 import java.util.Collections;
24 import java.util.List;
25
26 /**
27 * Parses a CSV file interpreting the first line as a header. The
28 * {@link #parse(InputStream)} method and the setters are synchronized so that
29 * the object cannot be modified when parsing.
30 */
31 public abstract class CsvParser {
32 private char separator = ',';
33 private char quote = '\"';
34
35 private Boolean noHeader = false;
36 private Boolean strictLineAsLongAsHeader = true;
37
38 /**
39 * Actually process a parsed line. If
40 * {@link #setStrictLineAsLongAsHeader(Boolean)} is true (default) the
41 * header and the tokens are guaranteed to have the same size.
42 *
43 * @param lineNumber
44 * the current line number, starts at 1 (the header, if header
45 * processing is enabled, the first line otherwise)
46 * @param header
47 * the read-only header or null if {@link #setNoHeader(Boolean)}
48 * is true (default is false)
49 * @param tokens
50 * the parsed tokens
51 */
52 protected abstract void processLine(Integer lineNumber,
53 List<String> header, List<String> tokens);
54
55 /**
56 * Parses the CSV file (stream is closed at the end)
57 */
58 public synchronized void parse(InputStream in) {
59 parse(in, null);
60 }
61
62 /**
63 * Parses the CSV file (stream is closed at the end)
64 */
65 public synchronized void parse(InputStream in, String encoding) {
66 BufferedReader reader = null;
67 Integer lineCount = 0;
68 try {
69 if (encoding == null)
70 reader = new BufferedReader(new InputStreamReader(in));
71 else
72 reader = new BufferedReader(new InputStreamReader(in, encoding));
73 List<String> header = null;
74 if (!noHeader) {
75 String headerStr = reader.readLine();
76 if (headerStr == null)// empty file
77 return;
78 lineCount++;
79 header = new ArrayList<String>();
80 StringBuffer currStr = new StringBuffer("");
81 Boolean wasInquote = false;
82 while (parseLine(headerStr, header, currStr, wasInquote)) {
83 headerStr = reader.readLine();
84 if (headerStr == null)
85 break;
86 wasInquote = true;
87 }
88 header = Collections.unmodifiableList(header);
89 }
90
91 String line = null;
92 lines: while ((line = reader.readLine()) != null) {
93 line = preProcessLine(line);
94 if (line == null) {
95 // skip line
96 continue lines;
97 }
98 lineCount++;
99 List<String> tokens = new ArrayList<String>();
100 StringBuffer currStr = new StringBuffer("");
101 Boolean wasInquote = false;
102 sublines: while (parseLine(line, tokens, currStr, wasInquote)) {
103 line = reader.readLine();
104 if (line == null)
105 break sublines;
106 wasInquote = true;
107 }
108 if (!noHeader && strictLineAsLongAsHeader) {
109 int headerSize = header.size();
110 int tokenSize = tokens.size();
111 if (tokenSize == 1 && line.trim().equals(""))
112 continue lines;// empty line
113 if (headerSize != tokenSize) {
114 throw new UtilsException("Token size " + tokenSize
115 + " is different from header size "
116 + headerSize + " at line " + lineCount
117 + ", line: " + line + ", header: " + header
118 + ", tokens: " + tokens);
119 }
120 }
121 processLine(lineCount, header, tokens);
122 }
123 } catch (UtilsException e) {
124 throw e;
125 } catch (IOException e) {
126 throw new UtilsException("Cannot parse CSV file (line: "
127 + lineCount + ")", e);
128 } finally {
129 StreamUtils.closeQuietly(reader);
130 }
131 }
132
133 /**
134 * Called before each (logical) line is processed, giving a change to modify
135 * it (typically for cleaning dirty files). To be overridden, return the
136 * line unchanged by default. Skip the line if 'null' is returned.
137 */
138 protected String preProcessLine(String line) {
139 return line;
140 }
141
142 /**
143 * Parses a line character by character for performance purpose
144 *
145 * @return whether to continue parsing this line
146 */
147 protected Boolean parseLine(String str, List<String> tokens,
148 StringBuffer currStr, Boolean wasInquote) {
149 // List<String> tokens = new ArrayList<String>();
150
151 // System.out.println("#LINE: " + str);
152
153 if (wasInquote)
154 currStr.append('\n');
155
156 char[] arr = str.toCharArray();
157 boolean inQuote = wasInquote;
158 // StringBuffer currStr = new StringBuffer("");
159 for (int i = 0; i < arr.length; i++) {
160 char c = arr[i];
161 if (c == separator) {
162 if (!inQuote) {
163 tokens.add(currStr.toString());
164 // System.out.println("# TOKEN: " + currStr);
165 currStr.delete(0, currStr.length());
166 } else {
167 // we don't remove separator that are in a quoted substring
168 // System.out
169 // .println("IN QUOTE, got a separator: [" + c + "]");
170 currStr.append(c);
171 }
172 } else if (c == quote) {
173 if (inQuote && (i + 1) < arr.length && arr[i + 1] == quote) {
174 // case of double quote
175 currStr.append(quote);
176 i++;
177 } else {// standard
178 inQuote = inQuote ? false : true;
179 }
180 } else {
181 currStr.append(c);
182 }
183 }
184
185 if (!inQuote) {
186 tokens.add(currStr.toString());
187 // System.out.println("# TOKEN: " + currStr);
188 }
189 // if (inQuote)
190 // throw new ArgeoException("Missing quote at the end of the line "
191 // + str + " (parsed: " + tokens + ")");
192 if (inQuote)
193 return true;
194 else
195 return false;
196 // return tokens;
197 }
198
199 public char getSeparator() {
200 return separator;
201 }
202
203 public synchronized void setSeparator(char separator) {
204 this.separator = separator;
205 }
206
207 public char getQuote() {
208 return quote;
209 }
210
211 public synchronized void setQuote(char quote) {
212 this.quote = quote;
213 }
214
215 public Boolean getNoHeader() {
216 return noHeader;
217 }
218
219 public synchronized void setNoHeader(Boolean noHeader) {
220 this.noHeader = noHeader;
221 }
222
223 public Boolean getStrictLineAsLongAsHeader() {
224 return strictLineAsLongAsHeader;
225 }
226
227 public synchronized void setStrictLineAsLongAsHeader(
228 Boolean strictLineAsLongAsHeader) {
229 this.strictLineAsLongAsHeader = strictLineAsLongAsHeader;
230 }
231
232 }