1 package org
.argeo
.app
.library
;
3 import java
.io
.ByteArrayInputStream
;
4 import java
.io
.ByteArrayOutputStream
;
5 import java
.io
.IOException
;
6 import java
.io
.InputStreamReader
;
8 import java
.nio
.charset
.StandardCharsets
;
9 import java
.nio
.file
.Files
;
10 import java
.nio
.file
.Path
;
11 import java
.nio
.file
.Paths
;
12 import java
.util
.ArrayList
;
13 import java
.util
.HashSet
;
14 import java
.util
.List
;
17 import java
.util
.TreeMap
;
18 import java
.util
.zip
.ZipEntry
;
19 import java
.util
.zip
.ZipInputStream
;
21 import javax
.xml
.parsers
.ParserConfigurationException
;
22 import javax
.xml
.parsers
.SAXParser
;
23 import javax
.xml
.parsers
.SAXParserFactory
;
25 import org
.argeo
.cms
.util
.DigestUtils
;
26 import org
.xml
.sax
.Attributes
;
27 import org
.xml
.sax
.InputSource
;
28 import org
.xml
.sax
.SAXException
;
29 import org
.xml
.sax
.XMLReader
;
30 import org
.xml
.sax
.helpers
.DefaultHandler
;
32 /** Parses a .docx document, trying its best to extract text and table data. */
33 public class DocxExtractor
{
34 final static String T
= "t";
35 final static String TC
= "tc";
36 final static String TR
= "tr";
37 final static String TBL
= "tbl";
38 final static String P
= "p";
39 static boolean debug
= false;
41 final static String PROOF_ERR
= "proofErr";
42 final static String TYPE
= "type";
43 final static String SPELL_START
= "spellStart";
44 final static String SPELL_END
= "spellEnd";
46 protected List
<Tbl
> tables
= new ArrayList
<>();
47 protected List
<String
> text
= new ArrayList
<>();
48 protected Map
<String
, byte[]> media
= new TreeMap
<>();
49 private Set
<String
> mediaDigests
= new HashSet
<>();
51 protected void processTextItem(List
<String
> lines
, String str
) {
55 protected boolean skipMedia(String digest
) {
59 class DocxHandler
extends DefaultHandler
{
61 private StringBuilder buffer
= new StringBuilder();
62 private Tbl currentTbl
= null;
64 boolean inSpellErr
= false;
65 boolean inParagraph
= false;
68 public void startElement(String uri
, String name
, String qName
, Attributes attributes
) throws SAXException
{
69 // System.out.println(localName + " " + qName + " " + uri.hashCode());
71 if (debug
&& currentTbl
== null)
72 System
.out
.println("# START PARA");
74 } else if (PROOF_ERR
.equals(name
)) {
75 String type
= attributes
.getValue(uri
, TYPE
);
76 if (SPELL_START
.equals(type
))
78 else if (SPELL_END
.equals(type
))
81 } else if (TBL
.equals(name
)) {
82 if (currentTbl
!= null) {
83 Tbl childTbl
= new Tbl();
84 childTbl
.parentTbl
= currentTbl
;
85 currentTbl
= childTbl
;
86 // throw new IllegalStateException("Already an active table");
88 currentTbl
= new Tbl();
94 public void endElement(String uri
, String name
, String qName
) throws SAXException
{
97 // // do not reset the buffer
101 if (currentTbl
!= null) {
102 currentTbl
.appendText(buffer
.toString());
104 String str
= buffer
.toString();
105 // replace NO-BREAK SPACE by regular space.
106 str
= str
.replace('\u00A0', ' ');
108 if (!"".equals(str
)) {
109 processTextItem(text
, str
);
112 } else if (name
.equals(P
)) {
113 if (debug
&& currentTbl
== null)
114 System
.out
.println("# END PARA");
115 if (currentTbl
!= null) {
116 currentTbl
.currentRow
.current
.text
.append('\n');
121 } else if (name
.equals(TC
)) {
122 if (currentTbl
!= null)
123 currentTbl
.closeColumn();
124 } else if (name
.equals(TR
)) {
125 if (currentTbl
!= null)
126 currentTbl
.closeRow();
127 } else if (name
.equals(TBL
)) {
128 if (currentTbl
!= null) {
129 tables
.add(currentTbl
);
130 if (currentTbl
.parentTbl
!= null)
131 currentTbl
= currentTbl
.parentTbl
;
135 throw new IllegalStateException("Closing a table while none was open.");
143 public void characters(char[] ch
, int start
, int length
) throws SAXException
{
144 buffer
.append(ch
, start
, length
);
149 public static class Tbl
{
150 Tbl parentTbl
= null;
151 Tr currentRow
= new Tr();
152 List
<Tr
> rows
= new ArrayList
<>();
154 void appendText(String str
) {
155 currentRow
.current
.text
.append(str
);
159 currentRow
.columns
.add(currentRow
.current
);
160 currentRow
.current
= new Tc();
164 rows
.add(currentRow
);
165 currentRow
= new Tr();
168 public List
<Tr
> getRows() {
173 public String
toString() {
174 StringBuilder sb
= new StringBuilder();
176 String txt
= tr
.toString();
177 sb
.append(txt
).append('\n');
179 return sb
.toString();
183 public static class Tr
{
184 Tc current
= new Tc();
185 List
<Tc
> columns
= new ArrayList
<>();
188 public String
toString() {
189 StringBuilder sb
= new StringBuilder();
190 for (Tc tc
: columns
) {
191 sb
.append("\"").append(tc
.toString()).append("\"").append(',');
193 return sb
.toString();
196 public List
<Tc
> getColumns() {
202 public static class Tc
{
203 StringBuilder text
= new StringBuilder();
206 public String
toString() {
207 return text
.toString().trim();
212 protected void parse(Reader in
) {
214 SAXParserFactory spf
= SAXParserFactory
.newInstance();
215 spf
.setNamespaceAware(true);
216 SAXParser saxParser
= spf
.newSAXParser();
217 XMLReader xmlReader
= saxParser
.getXMLReader();
218 xmlReader
.setContentHandler(new DocxHandler());
219 xmlReader
.parse(new InputSource(in
));
220 } catch (ParserConfigurationException
| SAXException
| IOException e
) {
221 throw new RuntimeException("Cannot parse document", e
);
225 public List
<String
> getText() {
229 public List
<Tbl
> getTables() {
233 public Map
<String
, byte[]> getMedia() {
237 public void load(ZipInputStream zIn
) {
239 ZipEntry entry
= null;
240 while ((entry
= zIn
.getNextEntry()) != null) {
241 if ("word/document.xml".equals(entry
.getName())) {
242 try (ByteArrayOutputStream out
= new ByteArrayOutputStream()) {
243 byte[] buffer
= new byte[2048];
245 while ((len
= zIn
.read(buffer
)) > 0) {
246 out
.write(buffer
, 0, len
);
248 try (Reader reader
= new InputStreamReader(new ByteArrayInputStream(out
.toByteArray()),
249 StandardCharsets
.UTF_8
)) {
253 } else if (entry
.getName().startsWith("word/media")) {
254 String fileName
= entry
.getName().substring(entry
.getName().lastIndexOf('/') + 1);
255 int dotIndex
= fileName
.lastIndexOf('.');
256 String ext
= fileName
.substring(dotIndex
+ 1).toLowerCase();
258 if ("jpeg".equals(ext
))
260 fileName
= fileName
.substring(0, dotIndex
) + "." + ext
;
267 try (ByteArrayOutputStream out
= new ByteArrayOutputStream()) {
268 byte[] buffer
= new byte[2048];
270 while ((len
= zIn
.read(buffer
)) > 0) {
271 out
.write(buffer
, 0, len
);
273 byte[] bytes
= out
.toByteArray();
274 String digest
= DigestUtils
.digest(DigestUtils
.MD5
, bytes
);
275 if (skipMedia(digest
))
277 if (!mediaDigests
.contains(digest
)) {
278 media
.put(fileName
, bytes
);
279 mediaDigests
.add(digest
);
287 // System.out.println(entry.getName());
290 } catch (IOException e
) {
291 // TODO Auto-generated catch block
294 // throw new IllegalArgumentException("No document.xml found");
298 // public static Reader extractDocumentXml(ZipInputStream zIn) throws IOException {
299 // ZipEntry entry = null;
300 // while ((entry = zIn.getNextEntry()) != null) {
301 // if ("word/document.xml".equals(entry.getName())) {
302 // try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
303 // byte[] buffer = new byte[2048];
305 // while ((len = zIn.read(buffer)) > 0) {
306 // out.write(buffer, 0, len);
308 // return new InputStreamReader(new ByteArrayInputStream(out.toByteArray()), StandardCharsets.UTF_8);
311 // System.out.println(entry.getName());
314 // throw new IllegalArgumentException("No document.xml found");
317 // protected static ZipInputStream openAsZip(String file) throws IOException {
318 // ZipInputStream zIn;
319 // Path path = Paths.get(file);
320 // zIn = new ZipInputStream(Files.newInputStream(path));
324 public static void main(String
[] args
) throws IOException
{
325 if (args
.length
== 0)
326 throw new IllegalArgumentException("Provide a file path");
327 Path p
= Paths
.get(args
[0]);
329 DocxExtractor importer
= new DocxExtractor();
330 try (ZipInputStream zIn
= new ZipInputStream(Files
.newInputStream(p
))) {
334 System
.out
.println("## TEXT");
335 for (int i
= 0; i
< importer
.text
.size(); i
++) {
336 String str
= importer
.text
.get(i
);
337 System
.out
.println(str
);
340 System
.out
.println("\n");
342 for (int i
= 0; i
< importer
.tables
.size(); i
++) {
343 Tbl tbl
= importer
.tables
.get(i
);
344 System
.out
.println("## TABLE " + i
);
345 System
.out
.println(tbl
);
348 System
.out
.println("## MEDIA");
349 for (String fileName
: importer
.media
.keySet()) {
350 int sizeKb
= importer
.media
.get(fileName
).length
/ 1024;
351 System
.out
.println(fileName
+ " " + sizeKb
+ " kB");