Read Horrible Word (.doc) Content using Apache POI Library

Apache POI is the best library I have found so far that is compatible with the old Microsoft .doc format (of course, it also handles .docx and other MS formats). With the org.apache.poi.hwpf package you can read the content of a .doc file, although the result may not be as perfect as you would wish in certain situations.

The weirdest problem I’ve encountered was that when I attempted to extract text from a Word file, some characters were strangely swallowed and the console terminal beeped. It took me a while to find out that this was probably because certain characters in the extracted text were being treated as control characters (like the 0x07 BEL character). The workaround is to strip the control characters from the text returned by HWPFDocument, e.g. with .text().replaceAll("\\p{Cntrl}", "").

Here is some sample code that reads a .doc file.

/** Matches any control character; compiled once instead of once per table cell. */
private static final java.util.regex.Pattern CONTROL_CHARS =
        java.util.regex.Pattern.compile("\\p{Cntrl}");

/**
 * Prints the text of a legacy Word (.doc) file to standard output using four
 * different Apache POI approaches, one after another:
 * <ol>
 *   <li>{@code WordToTextConverter.getText(File)}</li>
 *   <li>{@code WordExtractor.getText()}</li>
 *   <li>{@code TableIterator} over the document range (tables only)</li>
 *   <li>a paragraph-by-paragraph walk that renders tables cell by cell</li>
 * </ol>
 * Any failure is reported with a stack trace on standard error.
 *
 * @param args {@code args[0]} is the path of the .doc file to read; when no
 *             argument is given a hint is printed to standard error and the
 *             method returns
 */
public static void main(String[] args) {
    if (args.length == 0) {
        System.err.println("Which doc file you want to read?");
        return;
    }

    String filePath = args[0];

    // The stream is declared as its own resource so that it is closed even when
    // the HWPFDocument constructor throws (for example on a file that is not a .doc).
    try (FileInputStream input = new FileInputStream(filePath);
         HWPFDocument hwpf = new HWPFDocument(input)) {

        System.out.println("============= Method 1: WordToTextConverter =============");
        System.out.println(WordToTextConverter.getText(new File(filePath)));

        System.out.println("============= Method 2: WordExtractor =============");
        // Author's note: based on WordToTextConverter, but will include headers and footers.
        // NOTE(review): depending on the POI version, closing the extractor may also
        // close the document it wraps -- confirm before reusing this pattern elsewhere.
        try (WordExtractor wordExtractor = new WordExtractor(hwpf)) {
            System.out.println(wordExtractor.getText());
        }

        System.out.println("============= Method 3: TableIterator (Only Table) ============="); 
        // Author's note: getOverallRange() will include headers and footers, and
        // range.text() should not be used because its output is not accurate.
        Range range = hwpf.getRange();

        TableIterator tableIterator = new TableIterator(range);
        while (tableIterator.hasNext()) {
            Table table = tableIterator.next();
            System.out.println("=== [Table] ===\n" + table.text());
        }

        System.out.println("============= Method 4: Paragraph & Table ============="); 
        StringBuilder stringBuilder = new StringBuilder();
        for (int i = 0; i < range.numParagraphs(); i++) {
            Paragraph para = range.getParagraph(i);

            // Every table cell is reported as a paragraph of its own.
            if (!para.isInTable()) {
                stringBuilder.append("[not table] ").append(para.text()).append("\n");
                continue;
            }

            Table table;
            try {
                table = range.getTable(para);
            } catch (IllegalArgumentException ignored) {
                // Range.getTable(para) throws when para is not the first paragraph
                // of a table; that table was already rendered when its first
                // paragraph was visited, so there is nothing to do here.
                continue;
            }
            appendTableCells(stringBuilder, table);
        }
        System.out.println(stringBuilder.toString());

    } catch (Exception e) {
        e.printStackTrace();
    }
}

/**
 * Appends one line per row of {@code table} to {@code out}; every cell is
 * rendered as {@code [row, col]text | } with control characters removed.
 *
 * @param out   buffer that receives the rendered rows
 * @param table table whose rows and cells are rendered
 */
private static void appendTableCells(StringBuilder out, Table table) {
    for (int rowIdx = 0; rowIdx < table.numRows(); rowIdx++) {
        TableRow row = table.getRow(rowIdx);

        for (int colIdx = 0; colIdx < row.numCells(); colIdx++) {
            TableCell cell = row.getCell(colIdx);
            // Text extracted from HWPFDocument may contain control characters such as BEL.
            String cellText = CONTROL_CHARS.matcher(cell.text()).replaceAll("");
            out.append(String.format("[%d, %d]%s | ", rowIdx, colIdx, cellText));
        }
        out.append("\n");
    }
}

Leave a Comment Cancel Reply