git.argeo.org Git - gpl/argeo-slc.git/commitdiff
Introduce line tokenizer
author Mathieu Baudier <mbaudier@argeo.org>
Thu, 7 Feb 2008 11:15:40 +0000 (11:15 +0000)
committer Mathieu Baudier <mbaudier@argeo.org>
Thu, 7 Feb 2008 11:15:40 +0000 (11:15 +0000)
git-svn-id: https://svn.argeo.org/slc/trunk@944 4cfe0d0a-d680-48aa-b62c-e0a02a3f76cc

org.argeo.slc.core/src/main/java/org/argeo/slc/diff/LineTokenizer.java [new file with mode: 0644]
org.argeo.slc.core/src/test/java/org/argeo/slc/diff/LineTokenizerTest.java [new file with mode: 0644]

diff --git a/org.argeo.slc.core/src/main/java/org/argeo/slc/diff/LineTokenizer.java b/org.argeo.slc.core/src/main/java/org/argeo/slc/diff/LineTokenizer.java
new file mode 100644 (file)
index 0000000..fed4c37
--- /dev/null
@@ -0,0 +1,96 @@
+package org.argeo.slc.diff;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Parses a string as a vector of strings according to a separator, dealing
+ * properly with missing values. This is intended to be used instead of the
+ * standard StringTokenizer, which does not deal well with empty values.
+ * Contrary to the StringTokenizer, the provided String is parsed in the
+ * constructor and the values are stored as a property. This should therefore
+ * not be used to parse long strings. No reference to the argument passed in
+ * the constructor is kept.
+ * <p>
+ * A missing value is an empty field: before a leading separator, between two
+ * consecutive separators, or after a trailing separator. Each missing value is
+ * reported as a copy of the configured no-value string. A string that contains
+ * no separator at all (including the empty string) yields a single token equal
+ * to the string itself, not the no-value string.
+ */
+public class LineTokenizer {
+       private final List<String> tokens;
+
+       /**
+        * Complete constructor: parses the string immediately and stores the
+        * resulting tokens.
+        * <p>
+        * NOTE: this calls the overridable {@link #parse(String, char, String)},
+        * so an override in a subclass runs before that subclass's own fields
+        * have been initialised.
+        *
+        * @param stringToParse
+        *            the string to split; it is dereferenced by parse(), so
+        *            null results in a NullPointerException
+        * @param separator
+        *            the separator character; it is unboxed to char when passed
+        *            to parse(), so null results in a NullPointerException
+        * @param noValueString
+        *            the string reported for each missing value; it is copied
+        *            with new String(...), so it must not be null when the
+        *            input contains a missing value
+        */
+       public LineTokenizer(String stringToParse, Character separator,
+                       String noValueString) {
+               this.tokens = parse(stringToParse, separator, noValueString);
+       }
+
+       /**
+        * Parse the string as a vector of strings. Can be overridden in order to
+        * provide another implementation.
+        * <p>
+        * This implementation scans the characters once and remembers the index
+        * of the last separator seen. Every separator closes exactly one token
+        * and the finalisation step adds exactly one more, so the result always
+        * holds (number of separators + 1) tokens.
+        *
+        * @param stringToParse
+        *            the string to split, must not be null
+        * @param separator
+        *            the character delimiting the tokens
+        * @param noValueString
+        *            the string added (as a copy) for each missing value
+        * @return a new list holding the tokens in order of appearance
+        */
+       protected List<String> parse(final String stringToParse,
+                       final char separator, final String noValueString) {
+               // Init: NULL (-1) is the marker for "no separator seen yet"
+               final int NULL = -1;
+               List<String> res = new ArrayList<String>();
+               final char[] array = stringToParse.toCharArray();
+               int lastSeparatorIndex = NULL;
+
+               // Loop on chars: only separators trigger an action, each one closing the token that precedes it
+               for (int currIndex = 0; currIndex < array.length; currIndex++) {
+                       char c = array[currIndex];
+                       if (c == separator) {
+                               if (currIndex == 0) {
+                                       // first char is a separator: the leading field is a missing value
+                                       res.add(new String(noValueString));
+                                       lastSeparatorIndex = 0;
+                               } else if (lastSeparatorIndex == NULL) {
+                                       // first separator found: the token is everything from the start of the input
+                                       res.add(new String(array, 0, currIndex));
+                                       lastSeparatorIndex = currIndex;
+                               } else if (lastSeparatorIndex != NULL
+                                               && (lastSeparatorIndex == (currIndex - 1))) {
+                                       // consecutive separators: missing value (the != NULL test is always true here, NULL being handled by the previous branch)
+                                       res.add(new String(noValueString));
+                                       lastSeparatorIndex = currIndex;
+                               } else {
+                                       // simple case: the token lies strictly between the previous separator and this one
+                                       res.add(new String(array, lastSeparatorIndex + 1, currIndex
+                                                       - lastSeparatorIndex - 1));
+                                       lastSeparatorIndex = currIndex;
+                               }
+                       }
+               }
+
+               // Finalize: add the token that follows the last separator (or the whole input)
+               if (lastSeparatorIndex == NULL) {
+                       // no separator found: the whole input (possibly empty) is the only token
+                       res.add(new String(stringToParse));
+               } else if (lastSeparatorIndex == (array.length - 1)) {
+                       // last char is a separator: the trailing field is a missing value
+                       res.add(new String(noValueString));
+               } else {
+                       // last token: everything after the last separator
+                       res.add(new String(array, lastSeparatorIndex + 1, array.length
+                                       - lastSeparatorIndex - 1));
+               }
+               return res;
+       }
+
+       /**
+        * The tokens computed by the constructor, in order of appearance.
+        * <p>
+        * The internal list is returned as is, without a defensive copy.
+        *
+        * @return the list of tokens
+        */
+       public List<String> getTokens() {
+               return tokens;
+       }
+
+       /**
+        * Parse: convenience method which creates a LineTokenizer with the given
+        * arguments and returns its tokens.
+        *
+        * @param stringToParse
+        *            the string to split
+        * @param separator
+        *            the separator character
+        * @param noValueString
+        *            the string reported for each missing value
+        * @return the tokens of the newly created LineTokenizer
+        */
+       public static List<String> tokenize(String stringToParse,
+                       Character separator, String noValueString) {
+               LineTokenizer lt = new LineTokenizer(stringToParse, separator,
+                               noValueString);
+               return lt.getTokens();
+       }
+
+       /**
+        * Parse, using the empty string as no value string.
+        *
+        * @param stringToParse
+        *            the string to split
+        * @param separator
+        *            the separator character
+        * @return the tokens, missing values being reported as empty strings
+        */
+       public static List<String> tokenize(String stringToParse,
+                       Character separator) {
+               return tokenize(stringToParse, separator, "");
+       }
+
+}
diff --git a/org.argeo.slc.core/src/test/java/org/argeo/slc/diff/LineTokenizerTest.java b/org.argeo.slc.core/src/test/java/org/argeo/slc/diff/LineTokenizerTest.java
new file mode 100644 (file)
index 0000000..58ed61a
--- /dev/null
@@ -0,0 +1,53 @@
+package org.argeo.slc.diff;
+
+import java.util.List;
+
+import junit.framework.TestCase;
+
+public class LineTokenizerTest extends TestCase { // unit tests for LineTokenizer.tokenize
+       public void testSimple() throws Exception { // comma-separated inputs without any missing value
+               testAndAssert("a,b,c", new String[] { "a", "b", "c" });
+               testAndAssert("hello,bonjour,hallo,priviet", new String[] { "hello",
+                               "bonjour", "hallo", "priviet" });
+       }
+
+       public void testTricky() throws Exception { // edge cases: no separator, empty input, leading / consecutive / trailing separators
+               testAndAssert("alone", new String[] { "alone" }); // no separator: the input is the only token
+               testAndAssert("", new String[] { "" }); // empty input still yields one (empty) token
+
+               testAndAssert(",hello,bonjour,hallo,priviet", new String[] { "",
+                               "hello", "bonjour", "hallo", "priviet" });
+               testAndAssert("hello,bonjour,,hallo,priviet", new String[] { "hello",
+                               "bonjour", "", "hallo", "priviet" });
+               testAndAssert("hello,bonjour,hallo,priviet,", new String[] { "hello",
+                               "bonjour", "hallo", "priviet", "" });
+               testAndAssert(",hello,,bonjour,hallo,,,,priviet,", new String[] { "",
+                               "hello", "", "bonjour", "hallo", "", "", "", "priviet", "" });
+
+               testAndAssert(",,,", new String[] { "", "", "", "" }); // three separators delimit four empty fields
+       }
+
+       public void testComplex() throws Exception { // other separator characters and a non-empty no-value string
+               testAndAssert("a#b#c", '#', "", new String[] { "a", "b", "c" });
+               testAndAssert("hello!bonjour!hallo!priviet", '!', "", new String[] {
+                               "hello", "bonjour", "hallo", "priviet" });
+
+               testAndAssert("hello,,bonjour,,hallo,priviet", ',', "<EMPTY>",
+                               new String[] { "hello", "<EMPTY>", "bonjour", "<EMPTY>",
+                                               "hallo", "priviet" });
+       }
+
+       private void testAndAssert(String str, String[] expected) { // shorthand using the comma separator and the empty string as no-value string
+               testAndAssert(str, ',', "", expected);
+       }
+
+       private void testAndAssert(String str, Character sep, String noValueStr,
+                       String[] expected) { // tokenizes str, then checks the number of tokens and each token in order
+               List<String> res = LineTokenizer.tokenize(str, sep, noValueStr);
+               assertEquals("Size", expected.length, res.size()); // checked first, so expected[i] below cannot go out of bounds
+               for (int i = 0; i < res.size(); i++) {
+                       String token = res.get(i);
+                       assertEquals("Value@" + i, expected[i], token);
+               }
+       }
+}