org.argeo.app.core/src/org/argeo/app/library/DocxExtractor.java

   1 package org.argeo.app.library;
   2
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UncheckedIOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.argeo.util.DigestUtils;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
  31
  32 /** Parses a .docx document, trying its best to extract text and table data. */
  33 public class DocxExtractor {
  34         final static String T = "t";
  35         final static String TC = "tc";
  36         final static String TR = "tr";
  37         final static String TBL = "tbl";
  38         final static String P = "p";
  39         static boolean debug = false;
  40
  41         final static String PROOF_ERR = "proofErr";
  42         final static String TYPE = "type";
  43         final static String SPELL_START = "spellStart";
  44         final static String SPELL_END = "spellEnd";
  45
  46         protected List<Tbl> tables = new ArrayList<>();
  47         protected List<String> text = new ArrayList<>();
  48         protected Map<String, byte[]> media = new TreeMap<>();
  49         private Set<String> mediaDigests = new HashSet<>();
  50
  51         protected void processTextItem(List<String> lines, String str) {
  52                 lines.add(str);
  53         }
  54
  55         protected boolean skipMedia(String digest) {
  56                 return false;
  57         }
  58
  59         class DocxHandler extends DefaultHandler {
  60
  61                 private StringBuilder buffer = new StringBuilder();
  62                 private Tbl currentTbl = null;
  63
  64                 boolean inSpellErr = false;
  65                 boolean inParagraph = false;
  66
  67                 @Override
  68                 public void startElement(String uri, String name, String qName, Attributes attributes) throws SAXException {
  69                         // System.out.println(localName + " " + qName + " " + uri.hashCode());
  70                         if (P.equals(name)) {
  71                                 if (debug && currentTbl == null)
  72                                         System.out.println("# START PARA");
  73                                 inParagraph = true;
  74                         } else if (PROOF_ERR.equals(name)) {
  75                                 String type = attributes.getValue(uri, TYPE);
  76                                 if (SPELL_START.equals(type))
  77                                         inSpellErr = true;
  78                                 else if (SPELL_END.equals(type))
  79                                         inSpellErr = false;
  80
  81                         } else if (TBL.equals(name)) {
  82                                 if (currentTbl != null) {
  83                                         Tbl childTbl = new Tbl();
  84                                         childTbl.parentTbl = currentTbl;
  85                                         currentTbl = childTbl;
  86                                         // throw new IllegalStateException("Already an active table");
  87                                 } else {
  88                                         currentTbl = new Tbl();
  89                                 }
  90                         }
  91                 }
  92
  93                 @Override
  94                 public void endElement(String uri, String name, String qName) throws SAXException {
  95                         if (name.equals(T)) {
  96 //                              if (inSpellErr) {
  97 //                                      // do not reset the buffer
  98 //                                      return;
  99 //                              }
 100
 101                                 if (currentTbl != null) {
 102                                         currentTbl.appendText(buffer.toString());
 103                                 } else {
 104                                         String str = buffer.toString();
 105                                         // replace NO-BREAK SPACE by regular space.
 106                                         str = str.replace('\u00A0', ' ');
 107                                         str = str.strip();
 108                                         if (!"".equals(str)) {
 109                                                 processTextItem(text, str);
 110                                         }
 111                                 }
 112                         } else if (name.equals(P)) {
 113                                 if (debug && currentTbl == null)
 114                                         System.out.println("# END PARA");
 115                                 if (currentTbl != null) {
 116                                         currentTbl.currentRow.current.text.append('\n');
 117                                 } else {
 118
 119                                 }
 120                                 inParagraph = false;
 121                         } else if (name.equals(TC)) {
 122                                 if (currentTbl != null)
 123                                         currentTbl.closeColumn();
 124                         } else if (name.equals(TR)) {
 125                                 if (currentTbl != null)
 126                                         currentTbl.closeRow();
 127                         } else if (name.equals(TBL)) {
 128                                 if (currentTbl != null) {
 129                                         tables.add(currentTbl);
 130                                         if (currentTbl.parentTbl != null)
 131                                                 currentTbl = currentTbl.parentTbl;
 132                                         else
 133                                                 currentTbl = null;
 134                                 } else {
 135                                         throw new IllegalStateException("Closing a table while none was open.");
 136                                 }
 137                         }
 138                         // reset the buffer
 139                         buffer.setLength(0);
 140                 }
 141
 142                 @Override
 143                 public void characters(char[] ch, int start, int length) throws SAXException {
 144                         buffer.append(ch, start, length);
 145                 }
 146
 147         }
 148
 149         public static class Tbl {
 150                 Tbl parentTbl = null;
 151                 Tr currentRow = new Tr();
 152                 List<Tr> rows = new ArrayList<>();
 153
 154                 void appendText(String str) {
 155                         currentRow.current.text.append(str);
 156                 }
 157
 158                 void closeColumn() {
 159                         currentRow.columns.add(currentRow.current);
 160                         currentRow.current = new Tc();
 161                 }
 162
 163                 void closeRow() {
 164                         rows.add(currentRow);
 165                         currentRow = new Tr();
 166                 }
 167
 168                 public List<Tr> getRows() {
 169                         return rows;
 170                 }
 171
 172                 @Override
 173                 public String toString() {
 174                         StringBuilder sb = new StringBuilder();
 175                         for (Tr tr : rows) {
 176                                 String txt = tr.toString();
 177                                 sb.append(txt).append('\n');
 178                         }
 179                         return sb.toString();
 180                 }
 181         }
 182
 183         public static class Tr {
 184                 Tc current = new Tc();
 185                 List<Tc> columns = new ArrayList<>();
 186
 187                 @Override
 188                 public String toString() {
 189                         StringBuilder sb = new StringBuilder();
 190                         for (Tc tc : columns) {
 191                                 sb.append("\"").append(tc.toString()).append("\"").append(',');
 192                         }
 193                         return sb.toString();
 194                 }
 195
 196                 public List<Tc> getColumns() {
 197                         return columns;
 198                 }
 199
 200         }
 201
 202         public static class Tc {
 203                 StringBuilder text = new StringBuilder();
 204
 205                 @Override
 206                 public String toString() {
 207                         return text.toString().trim();
 208                 }
 209
 210         }
 211
 212         protected void parse(Reader in) {
 213                 try {
 214                         SAXParserFactory spf = SAXParserFactory.newInstance();
 215                         spf.setNamespaceAware(true);
 216                         SAXParser saxParser = spf.newSAXParser();
 217                         XMLReader xmlReader = saxParser.getXMLReader();
 218                         xmlReader.setContentHandler(new DocxHandler());
 219                         xmlReader.parse(new InputSource(in));
 220                 } catch (ParserConfigurationException | SAXException | IOException e) {
 221                         throw new RuntimeException("Cannot parse document", e);
 222                 }
 223         }
 224
 225         public List<String> getText() {
 226                 return text;
 227         }
 228
 229         public List<Tbl> getTables() {
 230                 return tables;
 231         }
 232
 233         public Map<String, byte[]> getMedia() {
 234                 return media;
 235         }
 236
 237         public void load(ZipInputStream zIn) {
 238                 try {
 239                         ZipEntry entry = null;
 240                         while ((entry = zIn.getNextEntry()) != null) {
 241                                 if ("word/document.xml".equals(entry.getName())) {
 242                                         try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
 243                                                 byte[] buffer = new byte[2048];
 244                                                 int len = 0;
 245                                                 while ((len = zIn.read(buffer)) > 0) {
 246                                                         out.write(buffer, 0, len);
 247                                                 }
 248                                                 try (Reader reader = new InputStreamReader(new ByteArrayInputStream(out.toByteArray()),
 249                                                                 StandardCharsets.UTF_8)) {
 250                                                         parse(reader);
 251                                                 }
 252                                         }
 253                                 } else if (entry.getName().startsWith("word/media")) {
 254                                         String fileName = entry.getName().substring(entry.getName().lastIndexOf('/') + 1);
 255                                         int dotIndex = fileName.lastIndexOf('.');
 256                                         String ext = fileName.substring(dotIndex + 1).toLowerCase();
 257                                         // we ignore .jfif
 258                                         if ("jpeg".equals(ext))
 259                                                 ext = "jpg";
 260                                         fileName = fileName.substring(0, dotIndex) + "." + ext;
 261                                         switch (ext) {
 262                                         case "png":
 263                                         case "jpg":
 264                                         case "gif":
 265                                         case "bmp":
 266                                         case "tiff":
 267                                                 try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
 268                                                         byte[] buffer = new byte[2048];
 269                                                         int len = 0;
 270                                                         while ((len = zIn.read(buffer)) > 0) {
 271                                                                 out.write(buffer, 0, len);
 272                                                         }
 273                                                         byte[] bytes = out.toByteArray();
 274                                                         String digest = DigestUtils.digest(DigestUtils.MD5, bytes);
 275                                                         if (skipMedia(digest))
 276                                                                 break;
 277                                                         if (!mediaDigests.contains(digest)) {
 278                                                                 media.put(fileName, bytes);
 279                                                                 mediaDigests.add(digest);
 280                                                         }
 281                                                 }
 282                                                 break;
 283                                         default:
 284                                                 break;
 285                                         }
 286                                 } else {
 287                                         // System.out.println(entry.getName());
 288                                 }
 289                         }
 290                 } catch (IOException e) {
 291                         // TODO Auto-generated catch block
 292                         e.printStackTrace();
 293                 }
 294                 // throw new IllegalArgumentException("No document.xml found");
 295
 296         }
 297
 298 //      public static Reader extractDocumentXml(ZipInputStream zIn) throws IOException {
 299 //              ZipEntry entry = null;
 300 //              while ((entry = zIn.getNextEntry()) != null) {
 301 //                      if ("word/document.xml".equals(entry.getName())) {
 302 //                              try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
 303 //                                      byte[] buffer = new byte[2048];
 304 //                                      int len = 0;
 305 //                                      while ((len = zIn.read(buffer)) > 0) {
 306 //                                              out.write(buffer, 0, len);
 307 //                                      }
 308 //                                      return new InputStreamReader(new ByteArrayInputStream(out.toByteArray()), StandardCharsets.UTF_8);
 309 //                              }
 310 //                      } else {
 311 //                              System.out.println(entry.getName());
 312 //                      }
 313 //              }
 314 //              throw new IllegalArgumentException("No document.xml found");
 315 //      }
 316
 317 //      protected static ZipInputStream openAsZip(String file) throws IOException {
 318 //              ZipInputStream zIn;
 319 //              Path path = Paths.get(file);
 320 //              zIn = new ZipInputStream(Files.newInputStream(path));
 321 //              return zIn;
 322 //      }
 323
 324         public static void main(String[] args) throws IOException {
 325                 if (args.length == 0)
 326                         throw new IllegalArgumentException("Provide a file path");
 327                 Path p = Paths.get(args[0]);
 328
 329                 DocxExtractor importer = new DocxExtractor();
 330                 try (ZipInputStream zIn = new ZipInputStream(Files.newInputStream(p))) {
 331                         importer.load(zIn);
 332                 }
 333                 // display
 334                 System.out.println("## TEXT");
 335                 for (int i = 0; i < importer.text.size(); i++) {
 336                         String str = importer.text.get(i);
 337                         System.out.println(str);
 338                 }
 339
 340                 System.out.println("\n");
 341
 342                 for (int i = 0; i < importer.tables.size(); i++) {
 343                         Tbl tbl = importer.tables.get(i);
 344                         System.out.println("## TABLE " + i);
 345                         System.out.println(tbl);
 346                 }
 347
 348                 System.out.println("## MEDIA");
 349                 for (String fileName : importer.media.keySet()) {
 350                         int sizeKb = importer.media.get(fileName).length / 1024;
 351                         System.out.println(fileName + " " + sizeKb + " kB");
 352                 }
 353         }
 354
 355 }