]> git.argeo.org Git - lgpl/argeo-commons.git/blob - basic/runtime/org.argeo.basic.nodeps/src/main/java/org/argeo/util/CsvParser.java
ca81bc98305700e3be83f5ebac44c02f8718f40b
[lgpl/argeo-commons.git] / basic / runtime / org.argeo.basic.nodeps / src / main / java / org / argeo / util / CsvParser.java
1 package org.argeo.util;
2
3 import java.io.BufferedReader;
4 import java.io.IOException;
5 import java.io.InputStream;
6 import java.io.InputStreamReader;
7 import java.util.ArrayList;
8 import java.util.Collections;
9 import java.util.List;
10
11 import org.argeo.ArgeoException;
12
13 /**
14 * Parses a CSV file interpreting the first line as a header. The
15 * {@link #parse(InputStream)} method and the setters are synchronized so that
16 * the object cannot be modified when parsing.
17 */
18 public abstract class CsvParser {
19 private char separator = ',';
20 private char quote = '\"';
21
22 private Boolean noHeader = false;
23 private Boolean strictLineAsLongAsHeader = true;
24
25 /**
26 * Actually process a parsed line. If
27 * {@link #setStrictLineAsLongAsHeader(Boolean)} is true (default) the
28 * header and the tokens are guaranteed to have the same size.
29 *
30 * @param lineNumber
31 * the current line number, starts at 1 (the header, if header
32 * processing is enabled, the first lien otherwise)
33 * @param header
34 * the read-only header or null if {@link #setNoHeader(Boolean)}
35 * is true (default is false)
36 * @param tokens
37 * the parse tokens
38 */
39 protected abstract void processLine(Integer lineNumber,
40 List<String> header, List<String> tokens);
41
42 public synchronized void parse(InputStream in) {
43 BufferedReader reader = null;
44 Integer lineCount = 0;
45 try {
46 reader = new BufferedReader(new InputStreamReader(in));
47
48 List<String> header = null;
49 if (!noHeader) {
50 String headerStr = reader.readLine();
51 if (headerStr == null)// empty file
52 return;
53 lineCount++;
54 header = new ArrayList<String>();
55 StringBuffer currStr = new StringBuffer("");
56 Boolean wasInquote = false;
57 while (parseLine(headerStr, header, currStr, wasInquote)) {
58 wasInquote = true;
59 }
60 header = Collections.unmodifiableList(header);
61 }
62
63 String line = null;
64 lines: while ((line = reader.readLine()) != null) {
65 lineCount++;
66 List<String> tokens = new ArrayList<String>();
67 StringBuffer currStr = new StringBuffer("");
68 Boolean wasInquote = false;
69 while (parseLine(line, tokens, currStr, wasInquote)) {
70 line = reader.readLine();
71 if (line == null)
72 break;
73 wasInquote = true;
74 }
75 if (!noHeader && strictLineAsLongAsHeader) {
76 int headerSize = header.size();
77 int tokenSize = tokens.size();
78 if (tokenSize == 1 && line.trim().equals(""))
79 continue lines;// empty line
80 if (headerSize != tokenSize) {
81 throw new ArgeoException("Token size " + tokenSize
82 + " is different from header size "
83 + headerSize + " at line " + lineCount
84 + ", line: " + line + ", header: " + header
85 + ", tokens: " + tokens);
86 }
87 }
88 processLine(lineCount, header, tokens);
89 }
90 } catch (ArgeoException e) {
91 throw e;
92 } catch (IOException e) {
93 throw new ArgeoException("Cannot parse CSV file (line: "
94 + lineCount + ")", e);
95 } finally {
96 if (reader != null)
97 try {
98 reader.close();
99 } catch (Exception e2) {
100 // silent
101 }
102 }
103 }
104
105 /**
106 * Parses a line character by character for performance purpose
107 *
108 * @return whether to continue parsing this line
109 */
110 protected Boolean parseLine(String str, List<String> tokens,
111 StringBuffer currStr, Boolean wasInquote) {
112 // List<String> tokens = new ArrayList<String>();
113
114 //System.out.println("#LINE: " + str);
115
116 if (wasInquote)
117 currStr.append('\n');
118
119 char[] arr = str.toCharArray();
120 boolean inQuote = wasInquote;
121 // StringBuffer currStr = new StringBuffer("");
122 for (int i = 0; i < arr.length; i++) {
123 char c = arr[i];
124 if (c == separator) {
125 if (!inQuote) {
126 tokens.add(currStr.toString());
127 //System.out.println("# TOKEN: " + currStr);
128 currStr.delete(0, currStr.length());
129 }
130 } else if (c == quote) {
131 if (inQuote && (i + 1) < arr.length && arr[i + 1] == quote) {
132 // case of double quote
133 currStr.append(quote);
134 i++;
135 } else {// standard
136 inQuote = inQuote ? false : true;
137 }
138 } else {
139 currStr.append(c);
140 }
141 }
142
143 if (!inQuote) {
144 tokens.add(currStr.toString());
145 //System.out.println("# TOKEN: " + currStr);
146 }
147 // if (inQuote)
148 // throw new ArgeoException("Missing quote at the end of the line "
149 // + str + " (parsed: " + tokens + ")");
150 if (inQuote)
151 return true;
152 else
153 return false;
154 // return tokens;
155 }
156
157 public char getSeparator() {
158 return separator;
159 }
160
161 public synchronized void setSeparator(char separator) {
162 this.separator = separator;
163 }
164
165 public char getQuote() {
166 return quote;
167 }
168
169 public synchronized void setQuote(char quote) {
170 this.quote = quote;
171 }
172
173 public Boolean getNoHeader() {
174 return noHeader;
175 }
176
177 public synchronized void setNoHeader(Boolean noHeader) {
178 this.noHeader = noHeader;
179 }
180
181 public Boolean getStrictLineAsLongAsHeader() {
182 return strictLineAsLongAsHeader;
183 }
184
185 public synchronized void setStrictLineAsLongAsHeader(
186 Boolean strictLineAsLongAsHeader) {
187 this.strictLineAsLongAsHeader = strictLineAsLongAsHeader;
188 }
189
190 }