How to Skip Headers and Footers Extraction using Apache Tika - apache-tika

How can I extract the text of documents (PDF, DOCX, DOC, ODT) without their headers and footers using Apache Tika?

I tested the code below with all of these file formats. It works for some of them (PDF and HTML) but not for DOC, DOCX, XLSX or XLS:
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.html.BoilerpipeContentHandler;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.tika.metadata.Metadata;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
/**
 * Parses the document behind a URL with Tika, passing the parser output through a
 * {@link BoilerpipeContentHandler} that wraps a {@link BodyContentHandler}.
 */
public class NewtikaXpath {

    /**
     * Parses the resource at the hard-coded URL and prints the string form of the body handler.
     *
     * @param args command-line arguments (unused)
     * @throws IOException if the URL is malformed or the stream cannot be read
     * @throws SAXException if the content handler reports a SAX error
     * @throws TikaException if Tika cannot parse the document
     */
    public static void main(String[] args) throws IOException, SAXException, TikaException {
        final AutoDetectParser autoParser = new AutoDetectParser();
        final ContentHandler bodyText = new BodyContentHandler();
        final Metadata docMetadata = new Metadata();
        final URL source = new URL("your favourite url");

        // The stream is closed automatically once parsing and printing are done.
        try (InputStream in = TikaInputStream.get(source)) {
            final BoilerpipeContentHandler boilerpipe = new BoilerpipeContentHandler(bodyText);
            autoParser.parse(in, boilerpipe, docMetadata);
            System.out.println("text:\n" + bodyText.toString());
        }
    }
}

You can do it programmatically. Here is how; it works for all Tika-supported documents, including DOCX, PPTX, ODT and PDF:
// Tell the Office parsers to leave header and footer text out of the extracted content.
OfficeParserConfig officeParserConfig = new OfficeParserConfig();
officeParserConfig.setIncludeHeadersAndFooters(false);

// The configuration reaches the parser through the parse context.
ParseContext parseContext = new ParseContext();
parseContext.set(OfficeParserConfig.class, officeParserConfig);

AutoDetectParser parser = new AutoDetectParser();
ContentHandler contentHandler = new BodyContentHandler();
Metadata metadata = new Metadata();

inputStream = new BufferedInputStream(new FileInputStream(inputFileName));
try {
    parser.parse(inputStream, contentHandler, metadata, parseContext);
} finally {
    // Release the file handle even when parsing fails.
    inputStream.close();
}
System.out.println(contentHandler.toString());

Related

Itext html2pdf use converter for create personal Paragraph

Where do I change the font size of the elements returned by the method
HtmlConverter.convertToElements?
Why do I get an unchanged result after executing the code below?
Why does the text between the tags not change size?
I also tried these solutions using the various methods (but the size does not change) :
https://stackoverflow.com/a/59044415/18323778
itext7 set font and size of HtmlConverter elements (In this question it is explained how to use the css too I am interested in changing only 2 properties of the element)
Run this code:
import com.itextpdf.html2pdf.ConverterProperties;
import com.itextpdf.html2pdf.HtmlConverter;
import com.itextpdf.html2pdf.resolver.font.DefaultFontProvider;
import com.itextpdf.io.font.constants.StandardFonts;
import com.itextpdf.kernel.font.PdfFont;
import com.itextpdf.kernel.font.PdfFontFactory;
import com.itextpdf.kernel.pdf.PdfDictionary;
import com.itextpdf.kernel.pdf.PdfDocument;
import com.itextpdf.kernel.pdf.PdfName;
import com.itextpdf.kernel.pdf.PdfWriter;
import com.itextpdf.layout.Document;
import com.itextpdf.layout.Style;
import com.itextpdf.layout.element.IElement;
import com.itextpdf.layout.element.Paragraph;
import com.itextpdf.layout.font.FontProvider;
import com.itextpdf.layout.properties.Property;
import com.itextpdf.layout.properties.UnitValue;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.List;
/**
 * Converts a short HTML fragment into layout elements and sets a font-size property on the
 * first element before adding it to a new PDF.
 *
 * @author UC9001309
 */
public class TestConvertHtml2pdf {

    /**
     * Writes a PDF whose file name is the current timestamp and whose only content is the
     * first element produced from the HTML fragment.
     *
     * @param args the command line arguments
     * @throws FileNotFoundException if the output file cannot be created
     * @throws IOException if writing the PDF fails
     */
    public static void main(String[] args) throws FileNotFoundException, IOException {
        final String timestamp = new SimpleDateFormat("yyyyMMddHHmmss").format(new java.util.Date());
        final PdfWriter writer = new PdfWriter("C:\\Temp\\" + timestamp + ".pdf");
        final PdfDocument pdf = new PdfDocument(writer);
        final Document layout = new Document(pdf);

        final FontProvider fonts = new DefaultFontProvider(true, false, false);
        final ConverterProperties converterProperties = new ConverterProperties();
        converterProperties.setFontProvider(fonts);
        converterProperties.setImmediateFlush(true);

        final List<IElement> elements =
                HtmlConverter.convertToElements("Testo <b><u><i>prova</i></u></b> con tag", converterProperties);

        // The property is set once through the IElement view and once through the Paragraph view
        // of the same object.
        final IElement first = elements.get(0);
        first.setProperty(Property.FONT_SIZE, UnitValue.createPointValue(Float.parseFloat("20")));
        final Paragraph paragraph = (Paragraph) first;
        paragraph.setProperty(Property.FONT_SIZE, UnitValue.createPointValue(Float.parseFloat("20")));

        layout.add(paragraph);
        layout.close();
    }
}
I tried various methods but still don't get the expected result. Does anyone know how to fix this?
Thanks for your help.
The goal is to create a method in iText 7 that takes as parameters a string containing HTML tags, a font and a font size, and lets the HTML parser apply bold, italics and underline, without setting them through iText calls on the paragraph (setBold, etc.).
My colleagues find it tiresome to program that by hand.
After several attempts I solved this (I create a main to try):
package testconverthtml2pdf;
import com.itextpdf.html2pdf.ConverterProperties;
import com.itextpdf.html2pdf.HtmlConverter;
import com.itextpdf.html2pdf.resolver.font.DefaultFontProvider;
import com.itextpdf.io.font.constants.StandardFonts;
import com.itextpdf.kernel.pdf.PdfDocument;
import com.itextpdf.kernel.pdf.PdfWriter;
import com.itextpdf.layout.Document;
import com.itextpdf.layout.element.IElement;
import com.itextpdf.layout.element.Paragraph;
import com.itextpdf.layout.font.FontProvider;
import com.itextpdf.layout.properties.UnitValue;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.List;
/**
 * Renders an HTML fragment into a PDF paragraph with a caller-chosen font family and font size.
 * The family and size are applied by wrapping the fragment in a styled {@code <p>} element
 * before it is handed to the HTML converter.
 *
 * @author UC9001309
 */
public class TestConvertHtml2pdf {

    /**
     * Writes a PDF whose file name is the current timestamp and whose only content is the
     * styled paragraph built from the test input.
     *
     * @param args the command line arguments
     * @throws FileNotFoundException if the output file cannot be created
     * @throws IOException if writing the PDF fails
     */
    public static void main(String[] args) throws FileNotFoundException, IOException {
        // Future parameters of the method change it to test !!!
        float vardim = 10.5f;
        String tipofont = StandardFonts.COURIER.toLowerCase();
        String strInput = "Testo <b><u><i>prova</i></u></b> con tag";
        // -------------------------------------------------------------
        FontProvider provider = new DefaultFontProvider(true, false, false);
        ConverterProperties cvProp = new ConverterProperties();
        cvProp.setFontProvider(provider);
        cvProp.setImmediateFlush(true);

        PdfWriter pdfWriter = new PdfWriter(
                "C:\\Temp\\" + new SimpleDateFormat("yyyyMMddHHmmss").format(new java.util.Date()) + ".pdf");
        // try-with-resources closes the document (and with it the PDF) even if conversion fails.
        try (Document document = new Document(new PdfDocument(pdfWriter))) {
            document.add(toStyledParagraph(strInput, tipofont, vardim, cvProp));
        }
    }

    /**
     * Wraps {@code html} in a {@code <p>} element carrying the font family and size, converts
     * the markup and returns the first resulting element as a paragraph.
     *
     * @param html       HTML fragment to render; inline tags such as b, u and i are left to the converter
     * @param fontFamily value written into the CSS {@code font-family} property
     * @param fontSize   font size in points
     * @param properties converter settings, including the font provider
     * @return the paragraph produced for the wrapped fragment
     * @throws IOException if the converter reports an I/O problem
     */
    private static Paragraph toStyledParagraph(
            String html, String fontFamily, float fontSize, ConverterProperties properties) throws IOException {
        String markup = "<p style='font-family:" + fontFamily
                + ";font-size:" + UnitValue.createPointValue(fontSize) + "'>"
                + html
                + "</p>";
        System.out.println(markup);
        List<IElement> elements = HtmlConverter.convertToElements(markup, properties);
        return (Paragraph) elements.get(0);
    }
}
To set the font and font size of the input string, I wrap it in a <p> element whose style attribute carries the font-family and font-size taken from the parameters:
stringBuilder.append("<p style='font-family:"+tipofont +";font-size:"+UnitValue.createPointValue(vardim)+"'>") ;
tipofont -----> Font parameter
vardim -----> Font size parameter
I have to thank André Lemos, who in a comment (now lost, because I deleted and recreated the question instead of editing it) pointed me to the solution: the bold text was not rendered because the bold variant of the font was missing. I had loaded Courier but not Courier-Bold.
If this solution can help anyone, it is available to the community.

Java How to format URL as a String to connect with JSoup Malformed URL error

I have a program that connects to a user-defined URL taken from a TextField and scrapes the images on that web page. The URL is read from the text field via .getText() and assigned to a String. The String is then used to connect to the web page with Jsoup, which loads the page into a Document.
// Read the address to connect to and load that page into the document with Jsoup.
String address = labelforAddress.getText();
try {
document = Jsoup.connect(address).get();
}catch(IOException ex){
// An I/O failure is only printed; document is not assigned in that case.
ex.printStackTrace();
}
I've tried differently formatted URLs ("https://www.", "www.", "https://"), but everything I use throws the malformed URL error.
Could someone please show me the correct way to get the text from the TextField?
Below is the entire code.
package sample;
import javafx.application.Application;
import javafx.fxml.FXMLLoader;
import javafx.scene.Parent;
import javafx.scene.Scene;
import javafx.scene.control.Button;
import javafx.scene.control.Label;
import javafx.scene.control.TextField;
import javafx.scene.layout.GridPane;
import javafx.stage.FileChooser;
import javafx.stage.Stage;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.*;
import java.net.URL;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
public class Main extends Application {
Document document;
LinkedList<String> imageURLList = new LinkedList<String>();
ArrayList<File> fileList = new ArrayList<File>();
int fileCount = 1;
#Override
public void start(Stage primaryStage) throws Exception{
Parent root = FXMLLoader.load(getClass().getResource("sample.fxml"));
primaryStage.setTitle("Webpage Photo Scraper");
GridPane gp = new GridPane();
Label labelforAddress = new Label("URL");
GridPane.setConstraints(labelforAddress, 0,0);
TextField URLAddress = new TextField();
GridPane.setConstraints(URLAddress, 1,0);
Button scrape = new Button("Scrape for Photos");
GridPane.setConstraints(scrape, 0,1);
scrape.setOnAction(event->{
String address = labelforAddress.getText();
try {
document = Jsoup.connect(address).get();
}catch(IOException ex){
ex.printStackTrace();
}
Elements imgTags = document.getElementsByAttributeValueContaining("src", "/CharacterImages");
for(Element imgTag: imgTags){
imageURLList.add(imgTag.absUrl("src"));
}
for(String url: imageURLList){
File file = new File("C:\\Users\\Andrei\\Documents\\file" + fileCount + ".txt");
downloadFromURL(url, file);
fileList.add(file);
fileCount++;
}
});
Button exportToZipFile = new Button("Export to Zip File");
GridPane.setConstraints(exportToZipFile, 0,2);
exportToZipFile.setOnAction(event -> {
FileChooser fileChooser = new FileChooser();
FileChooser.ExtensionFilter exfilt = new FileChooser.ExtensionFilter("Zip Files", ".zip");
fileChooser.getExtensionFilters().add(exfilt);
try{
FileOutputStream fos = new FileOutputStream(fileChooser.showSaveDialog(primaryStage));
ZipOutputStream zipOut = new ZipOutputStream(fos);
for(int count = 0; count<=fileList.size()-1; count++){
File fileToZip = new File(String.valueOf(fileList.get(count)));
FileInputStream fis = new FileInputStream(fileToZip);
ZipEntry zipEntry = new ZipEntry(fileToZip.getName());
zipOut.putNextEntry(zipEntry);
byte[] bytes = new byte[1024];
int length;
while((length = fis.read(bytes)) >= 0) {
zipOut.write(bytes, 0, length);
}
fis.close();
}
zipOut.close();
fos.close();
}catch(IOException e1){
e1.printStackTrace();
}
});
primaryStage.setScene(new Scene(gp, 300, 275));
primaryStage.show();
gp.getChildren().addAll(exportToZipFile, labelforAddress, scrape, URLAddress);
}
public static void downloadFromURL(String url, File file){
try {
URL Url = new URL(url);
BufferedInputStream bis = new BufferedInputStream(Url.openStream());
FileOutputStream fis = new FileOutputStream(file);
byte[] buffer = new byte[1024];
int count = 0;
while((count = bis.read(buffer, 0,1024)) !=-1){
fis.write(buffer, 0, count);
}
fis.close();
bis.close();
}catch(IOException e){
e.printStackTrace();
}
}
public static void main(String[] args) {
launch(args);
}
}
Your text field containing the value entered by user is stored in URLAddress object but you always try to get the url from labelforAddress object which is a label always containing "URL" text.
So the solution is to use:
// Take the address from the TextField (URLAddress) instead of the Label.
String address = URLAddress.getText();
Reading the error message carefully would have helped you find the cause, because it always displays the value it considers wrong. In this case I see:
Caused by: java.net.MalformedURLException: no protocol: URL
and it shows the unrecognized address is: URL.
If you encounter this kind of error again, try:
- debugging the application at runtime to see the value of each variable
- logging variable values to the console to check that they contain what you expect

How to simulate taking picture and give input to the app?

I have an app in which I need to scan a QR code. Taking the picture from within the app is not feasible, because I need to run the app on multiple devices at once and scanning requires a person to be present. How can I provide the QR code image/data to the app without scanning? Is there any way to simulate taking a picture and give a stored image to the app as input?
If you have an already-scanned test QR code image, you can push it to the device, and the app can read it from there.
Ask the dev team for the path the app reads the scanned image from, and push the test image to that same path.
Below is code that pushes an image file to the device, along with other methods that push/pull different file formats:
import io.appium.java_client.android.AndroidDriver;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import javax.imageio.ImageIO;
import org.apache.commons.codec.binary.Base64;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.remote.CapabilityType;
import org.openqa.selenium.remote.DesiredCapabilities;
import org.testng.annotations.AfterTest;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
#Test
public class pushFileTest {
public static AndroidDriver<WebElement> _driver;
#BeforeClass
public void setUpAppium() throws InterruptedException, IOException {
DesiredCapabilities cap = new DesiredCapabilities();
cap.setCapability("platformVersion","5.1");
cap.setCapability("platformName","Android");
cap.setCapability("deviceName","ZX12222D");
cap.setCapability("appPackage","io.appium.android.apis");
cap.setCapability("appActivity","ApiDemos");
//System.out.println("Before calling appium");
_driver = new AndroidDriver<WebElement>(new URL("http://127.0.0.1:4725/wd/hub"), cap);
//System.out.println("After calling appium");
}
#Test
public void pullImageFileFromMobileSDCardTest() throws IOException {
byte[] returnData = _driver.pullFile("/storage/sdcard1/IMG_20140828_072840.jpg");
//System.out.println("Base 64 Converted String received from mobile :: " + returnData);
BufferedImage image=ImageIO.read(new ByteArrayInputStream(returnData));
ImageIO.write(image, "jpg", new File("C:\\eclipse","snap.jpg"));
}
/* Test Case to pull log file from mobile device*/
#Test
public void pullTextFileFromMobileSDCardTest() throws IOException {
byte[] returnData = _driver.pullFile("/storage/sdcard1/mili_log.txt");
//System.out.println(" Printing Text of File received from mobile :: " + new String(Base64.decodeBase64(returnData)));
File fs = new File("C:\\eclipse\\MobileFile.txt");
FileOutputStream fos = new FileOutputStream(fs);
fos.write(returnData);
fos.flush();
fos.close();
}
#Test
public void pushImageFileToMobileTest() throws IOException {
File fi = new File("C:\\eclipse\\img1.jpg");
byte[] fileContent = Files.readAllBytes(fi.toPath());
_driver.pushFile("/storage/sdcard1", fileContent);
}
#Test
public void pushTextFileToMobileTest() throws IOException {
File fi = new File("C:\\eclipse\\MobileFile.txt");
byte[] data = Files.readAllBytes(fi.toPath());
System.out.println("Base 64 Converted String sent to mobile :: " + data);
_driver.pushFile("/storage/sdcard1/appium.txt",data);
}
public void pullVideoFileFromMobileSDCardTest() throws IOException {
byte[] returnData = _driver.pullFile("/storage/sdcard1/VideoIconfile.mp4");
//System.out.println(" Printing Text of File received from mobile :: " + new String(Base64.decodeBase64(returnData)));
//File fs = new File("C:\\eclipse\\video.mp4");
FileOutputStream fos = new FileOutputStream("C:\\eclipse\\video.mp4");
fos.write(returnData);
fos.flush();
fos.close();
}
#AfterTest(alwaysRun= true)
public void tearDown(){
if (_driver!= null )
_driver.quit();
System.out.println("tearDown() :: driver.quit() executed");
}
}

why pdfstamper is not working to insert new buttons in itextpf

I copied a program from a website (I am sure it was written by Bruno), but it does not work for me. I have a very plain PDF file and want to insert buttons into it using exactly this program.
However, it does not insert the button into the file. My original file is at "/WEB-INF/design100.pdf".
package admin_pack;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Set;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.Rectangle;
import com.itextpdf.text.pdf.AcroFields;
import com.itextpdf.text.pdf.GrayColor;
import com.itextpdf.text.pdf.PdfAction;
import com.itextpdf.text.pdf.PdfFormField;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.PdfStamper;
import com.itextpdf.text.pdf.PushbuttonField;
import com.itextpdf.text.pdf.TextField;
import javax.servlet.annotation.WebServlet;
/**
*
* #author Mushtaq
*/
#WebServlet(name = "FormServlet", urlPatterns = {"/FormServlet"})
public class FormServlet extends HttpServlet {
#Override
protected void doPost(HttpServletRequest request, HttpServletResponse response)
throws ServletException, IOException {
response.setContentType("application/pdf");
try {
// We get a resource from our web app
InputStream is
= getServletContext().getResourceAsStream("/WEB-INF/design100.pdf");
// We create a reader with the InputStream
PdfReader reader = new PdfReader(is, null);
// We create an OutputStream for the new PDF
ByteArrayOutputStream baos = new ByteArrayOutputStream();
// Now we create the PDF
PdfStamper stamper = new PdfStamper(reader, baos);
// We add a submit button to the existing form
PushbuttonField button = new PushbuttonField(
stamper.getWriter(), new Rectangle(90, 660, 140, 690), "submit");
button.setText("POST");
button.setBackgroundColor(new GrayColor(0.7f));
button.setVisibility(PushbuttonField.VISIBLE_BUT_DOES_NOT_PRINT);
PdfFormField submit = button.getField();
// submit.setAction(PdfAction.createSubmitForm(
// "/book/form", null, PdfAction.SUBMIT_HTML_FORMAT));
stamper.addAnnotation(submit, 1);
stamper.close();
reader.close();
// We write the PDF bytes to the OutputStream
OutputStream os = response.getOutputStream();
baos.writeTo(os);
os.flush();
} catch (DocumentException e) {
throw new IOException(e.getMessage());
}
}
}

parsing with dom4j

I can successfully retrieve the value of response from the following XML using the XPath expression /abcde/response:
<abcde>
<response>000</response>
</abcde>
But I cannot retrieve the value of response from the same XML once it carries some additional attributes, including a default namespace:
<abcde version="8.1" xmlns="http://www.litle.com/schema"
response="0" message="Valid Format">
<response>000</response>
</abcde>
What am I doing wrong?
package stackoverflow;
import java.io.ByteArrayInputStream;
import java.util.HashMap;
import java.util.Map;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.DocumentFactory;
import org.dom4j.DocumentHelper;
import org.dom4j.XPath;
import org.dom4j.io.SAXReader;
import org.dom4j.xpath.DefaultXPath;
import org.jaxen.VariableContext;
/**
 * Reads the {@code response} element of a namespaced XML document with a dom4j XPath
 * expression that uses a registered namespace prefix.
 */
public class MakejdomWork {

    /**
     * Runs the example.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        new MakejdomWork().run();
    }

    /**
     * Parses the sample document and prints the text of its {@code response} element.
     * A parse failure is printed and otherwise ignored.
     */
    public void run() {
        String xml = "<abcde version=\"8.1\" xmlns=\"http://www.litle.com/schema\" response=\"0\" message=\"Valid Format\"> <response>000</response></abcde>";
        // Variant without a namespace, for which the unprefixed path /abcde/response applies:
        // "<abcde><response>000</response></abcde>"
        ByteArrayInputStream bis =
                new ByteArrayInputStream(xml.getBytes(java.nio.charset.StandardCharsets.UTF_8));

        // Bind the prefix "x" to the document's default namespace so XPath can address it.
        Map<String, String> nsPrefixes = new HashMap<String, String>();
        nsPrefixes.put("x", "http://www.litle.com/schema");
        DocumentFactory factory = new DocumentFactory();
        factory.setXPathNamespaceURIs(nsPrefixes);

        SAXReader reader = new SAXReader();
        reader.setDocumentFactory(factory);
        try {
            Document doc = reader.read(bis);
            // Every element inherits the default namespace, so every step needs the prefix,
            // including the root element.
            String value = doc.valueOf("/x:abcde/x:response");
            System.out.println(value);
        } catch (DocumentException e) {
            e.printStackTrace();
        }
    }
}
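Short answer: you need to use namespace prefixes in the XPath expression if your parser is namespace-aware (which dom4j is). The default namespace declared on the root element applies to every element beneath it, so each step of the path needs the prefix.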

Resources