X-Git-Url: https://git.argeo.org/?p=gpl%2Fargeo-suite.git;a=blobdiff_plain;f=org.argeo.suite.core%2Fsrc%2Forg%2Fargeo%2Fsuite%2Flibrary%2FDocxExtractor.java;fp=org.argeo.suite.core%2Fsrc%2Forg%2Fargeo%2Fsuite%2Flibrary%2FDocxExtractor.java;h=0000000000000000000000000000000000000000;hp=53e73f33181850191b900f1dd94369c9a4f3057a;hb=6e56ffa34cb02ab04d028423aea342e3dfed4358;hpb=c285180bece610b2c2921d44fe14b6dde2123efa diff --git a/org.argeo.suite.core/src/org/argeo/suite/library/DocxExtractor.java b/org.argeo.suite.core/src/org/argeo/suite/library/DocxExtractor.java deleted file mode 100644 index 53e73f3..0000000 --- a/org.argeo.suite.core/src/org/argeo/suite/library/DocxExtractor.java +++ /dev/null @@ -1,355 +0,0 @@ -package org.argeo.suite.library; - -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.io.InputStreamReader; -import java.io.Reader; -import java.nio.charset.StandardCharsets; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.util.ArrayList; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.TreeMap; -import java.util.zip.ZipEntry; -import java.util.zip.ZipInputStream; - -import javax.xml.parsers.ParserConfigurationException; -import javax.xml.parsers.SAXParser; -import javax.xml.parsers.SAXParserFactory; - -import org.argeo.util.DigestUtils; -import org.xml.sax.Attributes; -import org.xml.sax.InputSource; -import org.xml.sax.SAXException; -import org.xml.sax.XMLReader; -import org.xml.sax.helpers.DefaultHandler; - -/** Parses a .docx document, trying its best to extract text and table data. */ -public class DocxExtractor { - final static String T = "t"; - final static String TC = "tc"; - final static String TR = "tr"; - final static String TBL = "tbl"; - final static String P = "p"; - static boolean debug = false; - - final static String PROOF_ERR = "proofErr"; - final static String TYPE = "type"; - final static String SPELL_START = "spellStart"; - final static String SPELL_END = "spellEnd"; - - protected List tables = new ArrayList<>(); - protected List text = new ArrayList<>(); - protected Map media = new TreeMap<>(); - private Set mediaDigests = new HashSet<>(); - - protected void processTextItem(List lines, String str) { - lines.add(str); - } - - protected boolean skipMedia(String digest) { - return false; - } - - class DocxHandler extends DefaultHandler { - - private StringBuilder buffer = new StringBuilder(); - private Tbl currentTbl = null; - - boolean inSpellErr = false; - boolean inParagraph = false; - - @Override - public void startElement(String uri, String name, String qName, Attributes attributes) throws SAXException { - // System.out.println(localName + " " + qName + " " + uri.hashCode()); - if (P.equals(name)) { - if (debug && currentTbl == null) - System.out.println("# START PARA"); - inParagraph = true; - } else if (PROOF_ERR.equals(name)) { - String type = attributes.getValue(uri, TYPE); - if (SPELL_START.equals(type)) - inSpellErr = true; - else if (SPELL_END.equals(type)) - inSpellErr = false; - - } else if (TBL.equals(name)) { - if (currentTbl != null) { - Tbl childTbl = new Tbl(); - childTbl.parentTbl = currentTbl; - currentTbl = childTbl; - // throw new IllegalStateException("Already an active table"); - } else { - currentTbl = new Tbl(); - } - } - } - - @Override - public void endElement(String uri, String name, String qName) throws SAXException { - if (name.equals(T)) { -// if (inSpellErr) { -// // do not reset the buffer -// return; -// } - - if (currentTbl != null) { - currentTbl.appendText(buffer.toString()); - } else { - String str = buffer.toString(); - // replace NO-BREAK SPACE by regular space. - str = str.replace('\u00A0', ' '); - str = str.strip(); - if (!"".equals(str)) { - processTextItem(text, str); - } - } - } else if (name.equals(P)) { - if (debug && currentTbl == null) - System.out.println("# END PARA"); - if (currentTbl != null) { - currentTbl.currentRow.current.text.append('\n'); - } else { - - } - inParagraph = false; - } else if (name.equals(TC)) { - if (currentTbl != null) - currentTbl.closeColumn(); - } else if (name.equals(TR)) { - if (currentTbl != null) - currentTbl.closeRow(); - } else if (name.equals(TBL)) { - if (currentTbl != null) { - tables.add(currentTbl); - if (currentTbl.parentTbl != null) - currentTbl = currentTbl.parentTbl; - else - currentTbl = null; - } else { - throw new IllegalStateException("Closing a table while none was open."); - } - } - // reset the buffer - buffer.setLength(0); - } - - @Override - public void characters(char[] ch, int start, int length) throws SAXException { - buffer.append(ch, start, length); - } - - } - - public static class Tbl { - Tbl parentTbl = null; - Tr currentRow = new Tr(); - List rows = new ArrayList<>(); - - void appendText(String str) { - currentRow.current.text.append(str); - } - - void closeColumn() { - currentRow.columns.add(currentRow.current); - currentRow.current = new Tc(); - } - - void closeRow() { - rows.add(currentRow); - currentRow = new Tr(); - } - - public List getRows() { - return rows; - } - - @Override - public String toString() { - StringBuilder sb = new StringBuilder(); - for (Tr tr : rows) { - String txt = tr.toString(); - sb.append(txt).append('\n'); - } - return sb.toString(); - } - } - - public static class Tr { - Tc current = new Tc(); - List columns = new ArrayList<>(); - - @Override - public String toString() { - StringBuilder sb = new StringBuilder(); - for (Tc tc : columns) { - sb.append("\"").append(tc.toString()).append("\"").append(','); - } - return sb.toString(); - } - - public List getColumns() { - return columns; - } - - } - - public static class Tc { - StringBuilder text = new StringBuilder(); - - @Override - public String toString() { - return text.toString().trim(); - } - - } - - protected void parse(Reader in) { - try { - SAXParserFactory spf = SAXParserFactory.newInstance(); - spf.setNamespaceAware(true); - SAXParser saxParser = spf.newSAXParser(); - XMLReader xmlReader = saxParser.getXMLReader(); - xmlReader.setContentHandler(new DocxHandler()); - xmlReader.parse(new InputSource(in)); - } catch (ParserConfigurationException | SAXException | IOException e) { - throw new RuntimeException("Cannot parse document", e); - } - } - - public List getText() { - return text; - } - - public List getTables() { - return tables; - } - - public Map getMedia() { - return media; - } - - public void load(ZipInputStream zIn) { - try { - ZipEntry entry = null; - while ((entry = zIn.getNextEntry()) != null) { - if ("word/document.xml".equals(entry.getName())) { - try (ByteArrayOutputStream out = new ByteArrayOutputStream()) { - byte[] buffer = new byte[2048]; - int len = 0; - while ((len = zIn.read(buffer)) > 0) { - out.write(buffer, 0, len); - } - try (Reader reader = new InputStreamReader(new ByteArrayInputStream(out.toByteArray()), - StandardCharsets.UTF_8)) { - parse(reader); - } - } - } else if (entry.getName().startsWith("word/media")) { - String fileName = entry.getName().substring(entry.getName().lastIndexOf('/') + 1); - int dotIndex = fileName.lastIndexOf('.'); - String ext = fileName.substring(dotIndex + 1).toLowerCase(); - // we ignore .jfif - if ("jpeg".equals(ext)) - ext = "jpg"; - fileName = fileName.substring(0, dotIndex) + "." + ext; - switch (ext) { - case "png": - case "jpg": - case "gif": - case "bmp": - case "tiff": - try (ByteArrayOutputStream out = new ByteArrayOutputStream()) { - byte[] buffer = new byte[2048]; - int len = 0; - while ((len = zIn.read(buffer)) > 0) { - out.write(buffer, 0, len); - } - byte[] bytes = out.toByteArray(); - String digest = DigestUtils.digest(DigestUtils.MD5, bytes); - if (skipMedia(digest)) - break; - if (!mediaDigests.contains(digest)) { - media.put(fileName, bytes); - mediaDigests.add(digest); - } - } - break; - default: - break; - } - } else { - // System.out.println(entry.getName()); - } - } - } catch (IOException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } - // throw new IllegalArgumentException("No document.xml found"); - - } - -// public static Reader extractDocumentXml(ZipInputStream zIn) throws IOException { -// ZipEntry entry = null; -// while ((entry = zIn.getNextEntry()) != null) { -// if ("word/document.xml".equals(entry.getName())) { -// try (ByteArrayOutputStream out = new ByteArrayOutputStream()) { -// byte[] buffer = new byte[2048]; -// int len = 0; -// while ((len = zIn.read(buffer)) > 0) { -// out.write(buffer, 0, len); -// } -// return new InputStreamReader(new ByteArrayInputStream(out.toByteArray()), StandardCharsets.UTF_8); -// } -// } else { -// System.out.println(entry.getName()); -// } -// } -// throw new IllegalArgumentException("No document.xml found"); -// } - -// protected static ZipInputStream openAsZip(String file) throws IOException { -// ZipInputStream zIn; -// Path path = Paths.get(file); -// zIn = new ZipInputStream(Files.newInputStream(path)); -// return zIn; -// } - - public static void main(String[] args) throws IOException { - if (args.length == 0) - throw new IllegalArgumentException("Provide a file path"); - Path p = Paths.get(args[0]); - - DocxExtractor importer = new DocxExtractor(); - try (ZipInputStream zIn = new ZipInputStream(Files.newInputStream(p))) { - importer.load(zIn); - } - // display - System.out.println("## TEXT"); - for (int i = 0; i < importer.text.size(); i++) { - String str = importer.text.get(i); - System.out.println(str); - } - - System.out.println("\n"); - - for (int i = 0; i < importer.tables.size(); i++) { - Tbl tbl = importer.tables.get(i); - System.out.println("## TABLE " + i); - System.out.println(tbl); - } - - System.out.println("## MEDIA"); - for (String fileName : importer.media.keySet()) { - int sizeKb = importer.media.get(fileName).length / 1024; - System.out.println(fileName + " " + sizeKb + " kB"); - } - } - -}