--- /dev/null
+package org.argeo.slc.diff;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Parses a string as a vector of strings according to a separator, dealing
+ * properly with missing values. This is intended to be used instead of the
+ * standard StringTokenizer, which does not deal well with empty values.
+ * Contrary to the StringTokenizer the provided String is parsed in the
+ * constructor and the values stored as a property. This should therefore not be
+ * used to parse long strings. No reference to the argument passed in
+ * constructor is kept.
+ */
+public class LineTokenizer {
+ private final List<String> tokens;
+
+ /** Complete constructor. */
+ public LineTokenizer(String stringToParse, Character separator,
+ String noValueString) {
+ this.tokens = parse(stringToParse, separator, noValueString);
+ }
+
+ /**
+ * Parse the string as a vector of strings. Can be overridden in order to
+ * provide another implementation.
+ */
+ protected List<String> parse(final String stringToParse,
+ final char separator, final String noValueString) {
+ // Init
+ final int NULL = -1;
+ List<String> res = new ArrayList<String>();
+ final char[] array = stringToParse.toCharArray();
+ int lastSeparatorIndex = NULL;
+
+ // Loop on chars
+ for (int currIndex = 0; currIndex < array.length; currIndex++) {
+ char c = array[currIndex];
+ if (c == separator) {
+ if (currIndex == 0) {
+ // first char is a separator
+ res.add(new String(noValueString));
+ lastSeparatorIndex = 0;
+ } else if (lastSeparatorIndex == NULL) {
+ // first separator found
+ res.add(new String(array, 0, currIndex));
+ lastSeparatorIndex = currIndex;
+ } else if (lastSeparatorIndex != NULL
+ && (lastSeparatorIndex == (currIndex - 1))) {
+ // consecutive separators
+ res.add(new String(noValueString));
+ lastSeparatorIndex = currIndex;
+ } else {
+ // simple case
+ res.add(new String(array, lastSeparatorIndex + 1, currIndex
+ - lastSeparatorIndex - 1));
+ lastSeparatorIndex = currIndex;
+ }
+ }
+ }
+
+ // Finalize
+ if (lastSeparatorIndex == NULL) {
+ // no separator found
+ res.add(new String(stringToParse));
+ } else if (lastSeparatorIndex == (array.length - 1)) {
+ // last char is a separator
+ res.add(new String(noValueString));
+ } else {
+ // last token
+ res.add(new String(array, lastSeparatorIndex + 1, array.length
+ - lastSeparatorIndex - 1));
+ }
+ return res;
+ }
+
+ /** The tokens. */
+ public List<String> getTokens() {
+ return tokens;
+ }
+
+ /** Parse */
+ public static List<String> tokenize(String stringToParse,
+ Character separator, String noValueString) {
+ LineTokenizer lt = new LineTokenizer(stringToParse, separator,
+ noValueString);
+ return lt.getTokens();
+ }
+
+ /** Parse, using the empty string as no value string. */
+ public static List<String> tokenize(String stringToParse,
+ Character separator) {
+ return tokenize(stringToParse, separator, "");
+ }
+
+}
--- /dev/null
+package org.argeo.slc.diff;
+
+import java.util.List;
+
+import junit.framework.TestCase;
+
+public class LineTokenizerTest extends TestCase {
+ public void testSimple() throws Exception {
+ testAndAssert("a,b,c", new String[] { "a", "b", "c" });
+ testAndAssert("hello,bonjour,hallo,priviet", new String[] { "hello",
+ "bonjour", "hallo", "priviet" });
+ }
+
+ public void testTricky() throws Exception {
+ testAndAssert("alone", new String[] { "alone" });
+ testAndAssert("", new String[] { "" });
+
+ testAndAssert(",hello,bonjour,hallo,priviet", new String[] { "",
+ "hello", "bonjour", "hallo", "priviet" });
+ testAndAssert("hello,bonjour,,hallo,priviet", new String[] { "hello",
+ "bonjour", "", "hallo", "priviet" });
+ testAndAssert("hello,bonjour,hallo,priviet,", new String[] { "hello",
+ "bonjour", "hallo", "priviet", "" });
+ testAndAssert(",hello,,bonjour,hallo,,,,priviet,", new String[] { "",
+ "hello", "", "bonjour", "hallo", "", "", "", "priviet", "" });
+
+ testAndAssert(",,,", new String[] { "", "", "", "" });
+ }
+
+ public void testComplex() throws Exception {
+ testAndAssert("a#b#c", '#', "", new String[] { "a", "b", "c" });
+ testAndAssert("hello!bonjour!hallo!priviet", '!', "", new String[] {
+ "hello", "bonjour", "hallo", "priviet" });
+
+ testAndAssert("hello,,bonjour,,hallo,priviet", ',', "<EMPTY>",
+ new String[] { "hello", "<EMPTY>", "bonjour", "<EMPTY>",
+ "hallo", "priviet" });
+ }
+
+ private void testAndAssert(String str, String[] expected) {
+ testAndAssert(str, ',', "", expected);
+ }
+
+ private void testAndAssert(String str, Character sep, String noValueStr,
+ String[] expected) {
+ List<String> res = LineTokenizer.tokenize(str, sep, noValueStr);
+ assertEquals("Size", expected.length, res.size());
+ for (int i = 0; i < res.size(); i++) {
+ String token = res.get(i);
+ assertEquals("Value@" + i, expected[i], token);
+ }
+ }
+}