-package org.argeo.suite.library;
-
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.io.Reader;
-import java.nio.charset.StandardCharsets;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.util.ArrayList;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.TreeMap;
-import java.util.zip.ZipEntry;
-import java.util.zip.ZipInputStream;
-
-import javax.xml.parsers.ParserConfigurationException;
-import javax.xml.parsers.SAXParser;
-import javax.xml.parsers.SAXParserFactory;
-
-import org.argeo.util.DigestUtils;
-import org.xml.sax.Attributes;
-import org.xml.sax.InputSource;
-import org.xml.sax.SAXException;
-import org.xml.sax.XMLReader;
-import org.xml.sax.helpers.DefaultHandler;
-
-/** Parses a .docx document, trying its best to extract text and table data. */
-public class DocxExtractor {
- final static String T = "t";
- final static String TC = "tc";
- final static String TR = "tr";
- final static String TBL = "tbl";
- final static String P = "p";
- static boolean debug = false;
-
- final static String PROOF_ERR = "proofErr";
- final static String TYPE = "type";
- final static String SPELL_START = "spellStart";
- final static String SPELL_END = "spellEnd";
-
- protected List<Tbl> tables = new ArrayList<>();
- protected List<String> text = new ArrayList<>();
- protected Map<String, byte[]> media = new TreeMap<>();
- private Set<String> mediaDigests = new HashSet<>();
-
- /**
-  * Hook called by the handler for each text item found outside a table. The
-  * item has already had its NO-BREAK SPACE characters replaced by regular
-  * spaces, has been stripped, and is never empty. This default implementation
-  * simply appends it to the list.
-  *
-  * @param lines the list collecting the text items
-  * @param str   the text item to process
-  */
- protected void processTextItem(List<String> lines, String str) {
- lines.add(str);
- }
-
- /**
-  * Hook consulted while loading, before a media entry is retained. This default
-  * implementation never skips anything.
-  *
-  * @param digest the MD5 digest of the media content, as computed while loading
-  * @return true if the media with this digest must not be retained
-  */
- protected boolean skipMedia(String digest) {
- return false;
- }
-
- /**
-  * SAX content handler which walks the document XML and feeds the enclosing
-  * extractor: text found outside tables goes through
-  * {@code processTextItem(List, String)}, while text found inside tables is
-  * collected into {@link Tbl} instances which are added to the list of tables
-  * when they are closed (a nested table is therefore added before its parent).
-  * Elements are matched on their local name only.
-  */
- class DocxHandler extends DefaultHandler {
-
-     /** Character data received since the last reset. */
-     private StringBuilder buffer = new StringBuilder();
-     /** Innermost table currently open, or null when outside any table. */
-     private Tbl currentTbl = null;
-
-     boolean inSpellErr = false;
-     boolean inParagraph = false;
-
-     /**
-      * Tracks paragraph, spell-error and table openings, and makes sure a text
-      * element starts with an empty buffer.
-      */
-     @Override
-     public void startElement(String uri, String name, String qName, Attributes attributes) throws SAXException {
-         if (T.equals(name)) {
-             // Only the content of the text element itself must end up in the
-             // buffer: drop any character data (e.g. the indentation of a
-             // pretty-printed file) received since the last closing tag.
-             buffer.setLength(0);
-         } else if (P.equals(name)) {
-             if (debug && currentTbl == null)
-                 System.out.println("# START PARA");
-             inParagraph = true;
-         } else if (PROOF_ERR.equals(name)) {
-             String type = attributes.getValue(uri, TYPE);
-             if (SPELL_START.equals(type))
-                 inSpellErr = true;
-             else if (SPELL_END.equals(type))
-                 inSpellErr = false;
-         } else if (TBL.equals(name)) {
-             // a table opened within a table is tracked as a child of the latter
-             Tbl opened = new Tbl();
-             opened.parentTbl = currentTbl;
-             currentTbl = opened;
-         }
-     }
-
-     /**
-      * Dispatches the buffered text when a text element closes and updates the
-      * table being built when a paragraph, cell, row or table closes.
-      *
-      * @throws IllegalStateException if a table is closed while none is open
-      */
-     @Override
-     public void endElement(String uri, String name, String qName) throws SAXException {
-         if (name.equals(T)) {
-             if (currentTbl != null) {
-                 currentTbl.appendText(buffer.toString());
-             } else {
-                 String str = buffer.toString();
-                 // replace NO-BREAK SPACE by regular space.
-                 str = str.replace('\u00A0', ' ');
-                 str = str.strip();
-                 if (!"".equals(str)) {
-                     processTextItem(text, str);
-                 }
-             }
-         } else if (name.equals(P)) {
-             if (debug && currentTbl == null)
-                 System.out.println("# END PARA");
-             if (currentTbl != null) {
-                 // paragraphs within a cell are separated by a line break
-                 currentTbl.appendText("\n");
-             }
-             inParagraph = false;
-         } else if (name.equals(TC)) {
-             if (currentTbl != null)
-                 currentTbl.closeColumn();
-         } else if (name.equals(TR)) {
-             if (currentTbl != null)
-                 currentTbl.closeRow();
-         } else if (name.equals(TBL)) {
-             if (currentTbl == null)
-                 throw new IllegalStateException("Closing a table while none was open.");
-             tables.add(currentTbl);
-             // back to the enclosing table, or to no table at all
-             currentTbl = currentTbl.parentTbl;
-         }
-         // reset the buffer
-         buffer.setLength(0);
-     }
-
-     /** Accumulates character data until the next reset of the buffer. */
-     @Override
-     public void characters(char[] ch, int start, int length) throws SAXException {
-         buffer.append(ch, start, length);
-     }
-
- }
-
- /**
-  * A table being built or already built: a list of closed rows, plus the row
-  * currently being filled and, for a nested table, a reference to its parent.
-  */
- public static class Tbl {
-     Tbl parentTbl = null;
-     Tr currentRow = new Tr();
-     List<Tr> rows = new ArrayList<>();
-
-     /** Appends text to the cell currently being filled in the current row. */
-     void appendText(String str) {
-         Tc cell = currentRow.current;
-         cell.text.append(str);
-     }
-
-     /** Adds the current cell to the current row and starts a new empty cell. */
-     void closeColumn() {
-         Tr row = currentRow;
-         row.columns.add(row.current);
-         row.current = new Tc();
-     }
-
-     /** Adds the current row to the table and starts a new empty row. */
-     void closeRow() {
-         Tr finished = currentRow;
-         currentRow = new Tr();
-         rows.add(finished);
-     }
-
-     /** Returns the closed rows; this is the internal list, not a copy. */
-     public List<Tr> getRows() {
-         return rows;
-     }
-
-     /** Renders the closed rows, one per line, each line being terminated by a line feed. */
-     @Override
-     public String toString() {
-         StringBuilder out = new StringBuilder();
-         rows.forEach(row -> out.append(row.toString()).append('\n'));
-         return out.toString();
-     }
- }
-
- /** A table row: the list of its closed cells plus the cell currently being filled. */
- public static class Tr {
-     Tc current = new Tc();
-     List<Tc> columns = new ArrayList<>();
-
-     /** Returns the closed cells; this is the internal list, not a copy. */
-     public List<Tc> getColumns() {
-         return columns;
-     }
-
-     /**
-      * Renders the closed cells on one line, each one between double quotes and
-      * followed by a comma (including the last one).
-      */
-     @Override
-     public String toString() {
-         StringBuilder line = new StringBuilder();
-         for (int i = 0; i < columns.size(); i++) {
-             String cell = columns.get(i).toString();
-             line.append('"').append(cell).append("\",");
-         }
-         return line.toString();
-     }
-
- }
-
- public static class Tc {
- StringBuilder text = new StringBuilder();
-
- @Override
- public String toString() {
- return text.toString().trim();
- }
-
- }
-
- /**
-  * Parses the document XML with a namespace-aware SAX parser, filling the text
-  * and tables of this extractor.
-  *
-  * @param in the character stream of the XML to parse
-  * @throws RuntimeException if the parser cannot be configured, or if the XML
-  *                          cannot be read or parsed
-  */
- protected void parse(Reader in) {
-     try {
-         SAXParserFactory spf = SAXParserFactory.newInstance();
-         spf.setNamespaceAware(true);
-         // The document may come from an untrusted source: apply the JAXP
-         // processing limits and never resolve external entities (XXE).
-         spf.setFeature("http://javax.xml.XMLConstants/feature/secure-processing", true);
-         spf.setFeature("http://xml.org/sax/features/external-general-entities", false);
-         spf.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
-         SAXParser saxParser = spf.newSAXParser();
-         XMLReader xmlReader = saxParser.getXMLReader();
-         xmlReader.setContentHandler(new DocxHandler());
-         xmlReader.parse(new InputSource(in));
-     } catch (ParserConfigurationException | SAXException | IOException e) {
-         throw new RuntimeException("Cannot parse document", e);
-     }
- }
-
- /** Returns the text items found outside tables; this is the internal list, not a copy. */
- public List<String> getText() {
- return text;
- }
-
- /** Returns the tables in the order in which they were closed; this is the internal list, not a copy. */
- public List<Tbl> getTables() {
- return tables;
- }
-
- /** Returns the retained media content keyed by file name; this is the internal map, not a copy. */
- public Map<String, byte[]> getMedia() {
- return media;
- }
-
- /**
-  * Reads all the entries of a .docx archive: {@code word/document.xml} is
-  * parsed for text and tables, and the supported images found under
-  * {@code word/media} are retained. Other entries are ignored, and the absence
-  * of {@code word/document.xml} is not reported as an error. The stream is not
-  * closed by this method.
-  *
-  * @param zIn the archive to read, positioned before its first entry
-  * @throws RuntimeException if the archive cannot be read or the document
-  *                          cannot be parsed
-  */
- public void load(ZipInputStream zIn) {
-     try {
-         ZipEntry entry;
-         while ((entry = zIn.getNextEntry()) != null) {
-             String entryName = entry.getName();
-             if ("word/document.xml".equals(entryName)) {
-                 // the entry is buffered, so that the parser never touches the zip stream
-                 byte[] xml = zIn.readAllBytes();
-                 try (Reader reader = new InputStreamReader(new ByteArrayInputStream(xml),
-                         StandardCharsets.UTF_8)) {
-                     parse(reader);
-                 }
-             } else if (entryName.startsWith("word/media")) {
-                 loadMedia(entryName, zIn);
-             }
-         }
-     } catch (IOException e) {
-         // do not leave callers with a silently half-loaded extractor
-         throw new RuntimeException("Cannot load document", e);
-     }
- }
-
- /** Retains the content of the current zip entry if it is a supported, not yet seen, image. */
- private void loadMedia(String entryName, ZipInputStream zIn) throws IOException {
-     String fileName = entryName.substring(entryName.lastIndexOf('/') + 1);
-     int dotIndex = fileName.lastIndexOf('.');
-     if (dotIndex < 0)
-         return; // directory entry or file without extension
-     // locale-independent, so that e.g. "GIF" is recognised under any default locale
-     String ext = fileName.substring(dotIndex + 1).toLowerCase(java.util.Locale.ROOT);
-     // we ignore .jfif
-     if ("jpeg".equals(ext))
-         ext = "jpg";
-     switch (ext) {
-     case "png":
-     case "jpg":
-     case "gif":
-     case "bmp":
-     case "tiff":
-         break;
-     default:
-         return;
-     }
-     byte[] bytes = zIn.readAllBytes();
-     String digest = DigestUtils.digest(DigestUtils.MD5, bytes);
-     if (skipMedia(digest))
-         return;
-     // identical content is retained only once, under the first name encountered
-     if (mediaDigests.add(digest))
-         media.put(fileName.substring(0, dotIndex) + "." + ext, bytes);
- }
-
-// public static Reader extractDocumentXml(ZipInputStream zIn) throws IOException {
-// ZipEntry entry = null;
-// while ((entry = zIn.getNextEntry()) != null) {
-// if ("word/document.xml".equals(entry.getName())) {
-// try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
-// byte[] buffer = new byte[2048];
-// int len = 0;
-// while ((len = zIn.read(buffer)) > 0) {
-// out.write(buffer, 0, len);
-// }
-// return new InputStreamReader(new ByteArrayInputStream(out.toByteArray()), StandardCharsets.UTF_8);
-// }
-// } else {
-// System.out.println(entry.getName());
-// }
-// }
-// throw new IllegalArgumentException("No document.xml found");
-// }
-
-// protected static ZipInputStream openAsZip(String file) throws IOException {
-// ZipInputStream zIn;
-// Path path = Paths.get(file);
-// zIn = new ZipInputStream(Files.newInputStream(path));
-// return zIn;
-// }
-
- /**
-  * Command line entry point: loads the .docx file whose path is given as first
-  * argument, then prints the extracted text, the tables and the size of each
-  * retained media to standard output.
-  *
-  * @param args the path of the file to load, as first element
-  * @throws IOException              if the file cannot be opened
-  * @throws IllegalArgumentException if no argument is provided
-  */
- public static void main(String[] args) throws IOException {
-     if (args.length == 0)
-         throw new IllegalArgumentException("Provide a file path");
-     Path docx = Paths.get(args[0]);
-
-     DocxExtractor extractor = new DocxExtractor();
-     try (ZipInputStream zip = new ZipInputStream(Files.newInputStream(docx))) {
-         extractor.load(zip);
-     }
-
-     // display
-     System.out.println("## TEXT");
-     for (String line : extractor.text)
-         System.out.println(line);
-
-     System.out.println("\n");
-
-     int tableIndex = 0;
-     for (Tbl table : extractor.tables) {
-         System.out.println("## TABLE " + tableIndex++);
-         System.out.println(table);
-     }
-
-     System.out.println("## MEDIA");
-     for (Map.Entry<String, byte[]> item : extractor.media.entrySet()) {
-         int sizeKb = item.getValue().length / 1024;
-         System.out.println(item.getKey() + " " + sizeKb + " kB");
-     }
- }
-
-}