git.argeo.org Git - gpl/argeo-suite.git/blob - org.argeo.app.core/src/org/argeo/app/library/DocxExtractor.java
Adapt to changes in Argeo Commons.
[gpl/argeo-suite.git] / org.argeo.app.core / src / org / argeo / app / library / DocxExtractor.java
1 package org.argeo.app.library;
2
3 import java.io.ByteArrayInputStream;
4 import java.io.ByteArrayOutputStream;
5 import java.io.IOException;
6 import java.io.InputStreamReader;
7 import java.io.Reader;
8 import java.nio.charset.StandardCharsets;
9 import java.nio.file.Files;
10 import java.nio.file.Path;
11 import java.nio.file.Paths;
12 import java.util.ArrayList;
13 import java.util.HashSet;
14 import java.util.List;
15 import java.util.Map;
16 import java.util.Set;
17 import java.util.TreeMap;
18 import java.util.zip.ZipEntry;
19 import java.util.zip.ZipInputStream;
20
21 import javax.xml.parsers.ParserConfigurationException;
22 import javax.xml.parsers.SAXParser;
23 import javax.xml.parsers.SAXParserFactory;
24
25 import org.argeo.util.DigestUtils;
26 import org.xml.sax.Attributes;
27 import org.xml.sax.InputSource;
28 import org.xml.sax.SAXException;
29 import org.xml.sax.XMLReader;
30 import org.xml.sax.helpers.DefaultHandler;
31
32 /** Parses a .docx document, trying its best to extract text and table data. */
33 public class DocxExtractor {
34 final static String T = "t";
35 final static String TC = "tc";
36 final static String TR = "tr";
37 final static String TBL = "tbl";
38 final static String P = "p";
39 static boolean debug = false;
40
41 final static String PROOF_ERR = "proofErr";
42 final static String TYPE = "type";
43 final static String SPELL_START = "spellStart";
44 final static String SPELL_END = "spellEnd";
45
46 protected List<Tbl> tables = new ArrayList<>();
47 protected List<String> text = new ArrayList<>();
48 protected Map<String, byte[]> media = new TreeMap<>();
49 private Set<String> mediaDigests = new HashSet<>();
50
51 protected void processTextItem(List<String> lines, String str) {
52 lines.add(str);
53 }
54
55 protected boolean skipMedia(String digest) {
56 return false;
57 }
58
59 class DocxHandler extends DefaultHandler {
60
61 private StringBuilder buffer = new StringBuilder();
62 private Tbl currentTbl = null;
63
64 boolean inSpellErr = false;
65 boolean inParagraph = false;
66
67 @Override
68 public void startElement(String uri, String name, String qName, Attributes attributes) throws SAXException {
69 // System.out.println(localName + " " + qName + " " + uri.hashCode());
70 if (P.equals(name)) {
71 if (debug && currentTbl == null)
72 System.out.println("# START PARA");
73 inParagraph = true;
74 } else if (PROOF_ERR.equals(name)) {
75 String type = attributes.getValue(uri, TYPE);
76 if (SPELL_START.equals(type))
77 inSpellErr = true;
78 else if (SPELL_END.equals(type))
79 inSpellErr = false;
80
81 } else if (TBL.equals(name)) {
82 if (currentTbl != null) {
83 Tbl childTbl = new Tbl();
84 childTbl.parentTbl = currentTbl;
85 currentTbl = childTbl;
86 // throw new IllegalStateException("Already an active table");
87 } else {
88 currentTbl = new Tbl();
89 }
90 }
91 }
92
93 @Override
94 public void endElement(String uri, String name, String qName) throws SAXException {
95 if (name.equals(T)) {
96 // if (inSpellErr) {
97 // // do not reset the buffer
98 // return;
99 // }
100
101 if (currentTbl != null) {
102 currentTbl.appendText(buffer.toString());
103 } else {
104 String str = buffer.toString();
105 // replace NO-BREAK SPACE by regular space.
106 str = str.replace('\u00A0', ' ');
107 str = str.strip();
108 if (!"".equals(str)) {
109 processTextItem(text, str);
110 }
111 }
112 } else if (name.equals(P)) {
113 if (debug && currentTbl == null)
114 System.out.println("# END PARA");
115 if (currentTbl != null) {
116 currentTbl.currentRow.current.text.append('\n');
117 } else {
118
119 }
120 inParagraph = false;
121 } else if (name.equals(TC)) {
122 if (currentTbl != null)
123 currentTbl.closeColumn();
124 } else if (name.equals(TR)) {
125 if (currentTbl != null)
126 currentTbl.closeRow();
127 } else if (name.equals(TBL)) {
128 if (currentTbl != null) {
129 tables.add(currentTbl);
130 if (currentTbl.parentTbl != null)
131 currentTbl = currentTbl.parentTbl;
132 else
133 currentTbl = null;
134 } else {
135 throw new IllegalStateException("Closing a table while none was open.");
136 }
137 }
138 // reset the buffer
139 buffer.setLength(0);
140 }
141
142 @Override
143 public void characters(char[] ch, int start, int length) throws SAXException {
144 buffer.append(ch, start, length);
145 }
146
147 }
148
149 public static class Tbl {
150 Tbl parentTbl = null;
151 Tr currentRow = new Tr();
152 List<Tr> rows = new ArrayList<>();
153
154 void appendText(String str) {
155 currentRow.current.text.append(str);
156 }
157
158 void closeColumn() {
159 currentRow.columns.add(currentRow.current);
160 currentRow.current = new Tc();
161 }
162
163 void closeRow() {
164 rows.add(currentRow);
165 currentRow = new Tr();
166 }
167
168 public List<Tr> getRows() {
169 return rows;
170 }
171
172 @Override
173 public String toString() {
174 StringBuilder sb = new StringBuilder();
175 for (Tr tr : rows) {
176 String txt = tr.toString();
177 sb.append(txt).append('\n');
178 }
179 return sb.toString();
180 }
181 }
182
183 public static class Tr {
184 Tc current = new Tc();
185 List<Tc> columns = new ArrayList<>();
186
187 @Override
188 public String toString() {
189 StringBuilder sb = new StringBuilder();
190 for (Tc tc : columns) {
191 sb.append("\"").append(tc.toString()).append("\"").append(',');
192 }
193 return sb.toString();
194 }
195
196 public List<Tc> getColumns() {
197 return columns;
198 }
199
200 }
201
202 public static class Tc {
203 StringBuilder text = new StringBuilder();
204
205 @Override
206 public String toString() {
207 return text.toString().trim();
208 }
209
210 }
211
212 protected void parse(Reader in) {
213 try {
214 SAXParserFactory spf = SAXParserFactory.newInstance();
215 spf.setNamespaceAware(true);
216 SAXParser saxParser = spf.newSAXParser();
217 XMLReader xmlReader = saxParser.getXMLReader();
218 xmlReader.setContentHandler(new DocxHandler());
219 xmlReader.parse(new InputSource(in));
220 } catch (ParserConfigurationException | SAXException | IOException e) {
221 throw new RuntimeException("Cannot parse document", e);
222 }
223 }
224
225 public List<String> getText() {
226 return text;
227 }
228
229 public List<Tbl> getTables() {
230 return tables;
231 }
232
233 public Map<String, byte[]> getMedia() {
234 return media;
235 }
236
237 public void load(ZipInputStream zIn) {
238 try {
239 ZipEntry entry = null;
240 while ((entry = zIn.getNextEntry()) != null) {
241 if ("word/document.xml".equals(entry.getName())) {
242 try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
243 byte[] buffer = new byte[2048];
244 int len = 0;
245 while ((len = zIn.read(buffer)) > 0) {
246 out.write(buffer, 0, len);
247 }
248 try (Reader reader = new InputStreamReader(new ByteArrayInputStream(out.toByteArray()),
249 StandardCharsets.UTF_8)) {
250 parse(reader);
251 }
252 }
253 } else if (entry.getName().startsWith("word/media")) {
254 String fileName = entry.getName().substring(entry.getName().lastIndexOf('/') + 1);
255 int dotIndex = fileName.lastIndexOf('.');
256 String ext = fileName.substring(dotIndex + 1).toLowerCase();
257 // we ignore .jfif
258 if ("jpeg".equals(ext))
259 ext = "jpg";
260 fileName = fileName.substring(0, dotIndex) + "." + ext;
261 switch (ext) {
262 case "png":
263 case "jpg":
264 case "gif":
265 case "bmp":
266 case "tiff":
267 try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
268 byte[] buffer = new byte[2048];
269 int len = 0;
270 while ((len = zIn.read(buffer)) > 0) {
271 out.write(buffer, 0, len);
272 }
273 byte[] bytes = out.toByteArray();
274 String digest = DigestUtils.digest(DigestUtils.MD5, bytes);
275 if (skipMedia(digest))
276 break;
277 if (!mediaDigests.contains(digest)) {
278 media.put(fileName, bytes);
279 mediaDigests.add(digest);
280 }
281 }
282 break;
283 default:
284 break;
285 }
286 } else {
287 // System.out.println(entry.getName());
288 }
289 }
290 } catch (IOException e) {
291 // TODO Auto-generated catch block
292 e.printStackTrace();
293 }
294 // throw new IllegalArgumentException("No document.xml found");
295
296 }
297
298 // public static Reader extractDocumentXml(ZipInputStream zIn) throws IOException {
299 // ZipEntry entry = null;
300 // while ((entry = zIn.getNextEntry()) != null) {
301 // if ("word/document.xml".equals(entry.getName())) {
302 // try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
303 // byte[] buffer = new byte[2048];
304 // int len = 0;
305 // while ((len = zIn.read(buffer)) > 0) {
306 // out.write(buffer, 0, len);
307 // }
308 // return new InputStreamReader(new ByteArrayInputStream(out.toByteArray()), StandardCharsets.UTF_8);
309 // }
310 // } else {
311 // System.out.println(entry.getName());
312 // }
313 // }
314 // throw new IllegalArgumentException("No document.xml found");
315 // }
316
317 // protected static ZipInputStream openAsZip(String file) throws IOException {
318 // ZipInputStream zIn;
319 // Path path = Paths.get(file);
320 // zIn = new ZipInputStream(Files.newInputStream(path));
321 // return zIn;
322 // }
323
324 public static void main(String[] args) throws IOException {
325 if (args.length == 0)
326 throw new IllegalArgumentException("Provide a file path");
327 Path p = Paths.get(args[0]);
328
329 DocxExtractor importer = new DocxExtractor();
330 try (ZipInputStream zIn = new ZipInputStream(Files.newInputStream(p))) {
331 importer.load(zIn);
332 }
333 // display
334 System.out.println("## TEXT");
335 for (int i = 0; i < importer.text.size(); i++) {
336 String str = importer.text.get(i);
337 System.out.println(str);
338 }
339
340 System.out.println("\n");
341
342 for (int i = 0; i < importer.tables.size(); i++) {
343 Tbl tbl = importer.tables.get(i);
344 System.out.println("## TABLE " + i);
345 System.out.println(tbl);
346 }
347
348 System.out.println("## MEDIA");
349 for (String fileName : importer.media.keySet()) {
350 int sizeKb = importer.media.get(fileName).length / 1024;
351 System.out.println(fileName + " " + sizeKb + " kB");
352 }
353 }
354
355 }