What I'm trying to do is to extract text from a PDF file in a batch operation, so I don't need to show the PDF file to the user. In fact, I can work entirely without a visual interface for this operation: I only need to show the extracted text at the end.
So, I'm using this code, taken from the SDK sample:
Code: Select all
/// <summary>
/// Runs Tesseract OCR (Italian dictionary) on a file without any UI and returns the recognized text.
/// For a PDF the currently selected page is rendered to an image at 200 DPI first;
/// any other format is loaded directly as an image.
/// </summary>
/// <param name="filepath">Path of the PDF or image file to process.</param>
/// <param name="moreaccuracy">When true the pass count is set to 0 (all possible passes); otherwise a single pass is used.</param>
/// <param name="usearea">When true OCR is restricted to the region given by the Area* parameters; otherwise the region of interest is reset.</param>
/// <param name="AreaLeft">Left coordinate of the region of interest.</param>
/// <param name="AreaTop">Top coordinate of the region of interest.</param>
/// <param name="AreaWidth">Width of the region of interest.</param>
/// <param name="AreaHeight">Height of the region of interest.</param>
/// <param name="whitelist">Character whitelist handed to the OCR call.</param>
/// <param name="blacklist">Value assigned to the Tesseract variable "tessedit_char_blacklist".</param>
/// <param name="resulttext">Receives the recognized text; empty when nothing was recognized or an exception occurred.</param>
/// <param name="error">Receives a description of the failure (including the GdPicture status where available); empty on success.</param>
/// <returns>True when OCR produced non-empty text; false otherwise.</returns>
public static bool extracttextfromfile(string filepath, bool moreaccuracy,
    bool usearea, int AreaLeft, int AreaTop, int AreaWidth, int AreaHeight,
    string whitelist, string blacklist,
    out string resulttext, out string error)
{
    bool result = false;
    resulttext = string.Empty;
    error = string.Empty;
    try
    {
        GdPicture14.LicenseManager lm = new GdPicture14.LicenseManager();
        lm.RegisterKEY(twainlicensekey);//private variable containing the key
        lm.RegisterKEY(pluginlicensekey);//private variable containing the key
        using (GdPictureImaging gdimg = new GdPictureImaging())
        {
            int imagehandle = 0;
            try
            {
                // Status of the load/render step, kept so a failure can be reported to the caller.
                GdPictureStatus loadstatus;
                if (GdPictureDocumentUtilities.GetDocumentFormat(filepath) == GdPicture14.DocumentFormat.DocumentFormatPDF)
                {
                    using (GdPicturePDF gdPicturePDF = new GdPicturePDF())
                    {
                        loadstatus = gdPicturePDF.LoadFromFile(filepath, false);
                        if (loadstatus == GdPictureStatus.OK)
                        {
                            imagehandle = gdPicturePDF.RenderPageToGdPictureImageEx(200, true);
                            // Read the render status before closing, while it still refers to the render call.
                            loadstatus = gdPicturePDF.GetStat();
                            gdPicturePDF.CloseDocument();
                        }
                    }
                }
                else
                {
                    imagehandle = gdimg.CreateGdPictureImageFromFile(filepath);
                    loadstatus = gdimg.GetStat();
                }

                if (imagehandle == 0)
                {
                    // Nothing to OCR: surface the SDK status instead of running OCR on an invalid handle.
                    error = "Impossibile caricare il documento: " + loadstatus;
                    return false;
                }

                // Apply the EXIF orientation (if any) so the OCR engine sees upright text,
                // then drop the tags so the rotation is not applied a second time.
                RotateFlipType pageRotate = (RotateFlipType)(gdimg.TagGetExifRotation(imagehandle));
                if (pageRotate != (RotateFlipType)GdPictureRotateFlipType.GdPictureRotateNoneFlipNone)
                {
                    gdimg.Rotate(imagehandle, pageRotate);
                    gdimg.TagDeleteAll(imagehandle);
                }

                gdimg.OCRTesseractSetOCRContext(OCRContext.OCRContextSingleBlock);//I'm using this one for now, more tests later on the context
                if (usearea)
                {
                    gdimg.SetROI(AreaLeft, AreaTop, AreaWidth, AreaHeight);
                }
                else
                {
                    gdimg.ResetROI();
                }
                gdimg.OCRTesseractReinit();
                ///ref: https://www.gdpicture.com/guides/gdpicture/Overview.html#Affect%20Tesseract%20OCR%20engine%20with%20special%20parameters.html
                gdimg.OCRTesseractSetVariable("tessedit_char_blacklist", blacklist);
                gdimg.OCRTesseractSetPassCount(moreaccuracy ? 0 : 1);//0 means all possible passes.

                resulttext = gdimg.OCRTesseractDoOCR(imagehandle, "ita", @"D:\GdPicture.NET 14\Redist\OCR\", whitelist);
                GdPictureStatus ocrstatus = gdimg.GetStat();
                if (ocrstatus == GdPictureStatus.OCRDictionaryNotFound)
                {
                    error = "Dizionario non trovato nel percorso specificato!";
                }
                else
                {
                    result = !string.IsNullOrEmpty(resulttext);
                    if (!result && ocrstatus != GdPictureStatus.OK)
                    {
                        // No text and a non-OK status: tell the caller why instead of failing silently.
                        error = "OCR non riuscito: " + ocrstatus;
                    }
                }
            }
            finally
            {
                // The image handle is an SDK-side resource; release it on every path,
                // otherwise each processed file leaks an image during a batch run.
                if (imagehandle != 0)
                {
                    gdimg.ReleaseGdPictureImage(imagehandle);
                }
            }
        }
    }
    catch (Exception err)
    {
        result = false;
        resulttext = string.Empty;
        error = err.Message;
    }
    return result;
}
if (gdPicturePDF.LoadFromFile(filepath, false) == GdPictureStatus.OK)
This is the error:
What am I doing wrong? I suspect that LoadFromFile depends on the PDF viewing library, which I do not have a license for, but sales support told me that the TWAIN and plugin licenses alone are enough as long as I don't need to display the PDF file in my window.
Is it something else? Or is there a way to render the PDF page to an image without using that function when working in the background?
Thanks