/*
 * Copyright (C) 2003-2004 Red Hat Inc. All Rights Reserved.
 *
 * The contents of this file are subject to the CCM Public
 * License (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the
 * License at http://www.redhat.com/licenses/ccmpl.html.
 *
 * Software distributed under the License is distributed on an
 * "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, either express
 * or implied. See the License for the specific language
 * governing rights and limitations under the License.
 *
 */
package com.arsdigita.search.converter;

import java.io.InputStream;
import java.io.IOException;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.util.PDFTextStripper;
import org.pdfbox.pdmodel.PDDocument;

/**
 * Converter that extracts the plain-text content of a PDF document
 * using PDFBox.
 */
public class PDFConverter extends BaseConverter {

    /** The NUL character (U+0000), which is removed from extracted text. */
    private static final char NUL = '\0';

    /**
     *  Returns a string array representing all of the file types
     *  handled by this converter.
     *
     *  @return a one-element array containing <code>"pdf"</code>
     */
    protected String[] getFileExtensions() {
        return new String[] {"pdf"};
    }

    /**
     *  Takes a PDF document represented as an InputStream and returns
     *  a text representation of that document. Any NUL (U+0000)
     *  characters in the extracted text are removed; every other
     *  character is returned unchanged.
     *
     *  @param stream the PDF document to convert
     *  @return the text extracted from the document, without NUL characters
     *  @throws ConversionException if an IOException occurs while the
     *          document is parsed, its text is extracted, or it is closed
     */
    public String convertDocument(InputStream stream) throws ConversionException {
        try {
            PDFParser parser = new PDFParser(stream);
            parser.parse();
            PDDocument pdDocument = parser.getPDDocument();
            String strippedText;
            try {
                strippedText = (new PDFTextStripper()).getText(pdDocument);
            } finally {
                // Close the document even when text extraction fails.
                pdDocument.close();
            }
            return removeNulCharacters(strippedText);
        } catch (IOException ioe) {
            throw new ConversionException(ioe);
        }
    }

    /**
     *  Returns the given text with every NUL (U+0000) character removed.
     *  The filtering is done on characters rather than on encoded bytes,
     *  so characters that occupy several bytes in the platform charset
     *  are preserved intact.
     */
    private static String removeNulCharacters(String text) {
        if (text.indexOf(NUL) < 0) {
            // Common case: nothing to strip, so avoid copying the text.
            return text;
        }
        StringBuffer buf = new StringBuffer(text.length());
        for (int i = 0; i < text.length(); i++) {
            char ch = text.charAt(i);
            if (ch != NUL) {
                buf.append(ch);
            }
        }
        return buf.toString();
    }

}
