I'm using GdPicture Pro .NET v8 to process some PDF files. I render each page to a GdPicture image, OCR it, and save the result to a new PDF. This is running orders of magnitude slower than my previous method in v7, and I can't figure out why. I am releasing the image handle when done, too. Based on my code, am I doing anything that would slow this down so badly? It burns through the first few pages of a PDF in a matter of seconds, but then begins to stall around the 15th page. (These are 500+ page PDFs.) At this rate, this PDF could take a few hours to complete, whereas my method in the v7 edition would finish in about 20-30 minutes. Any ideas?
Edit: I wanted to mention that this is compiled for a 64-bit processor and runs on a machine with 12 cores. It seems to be utilizing only one core, though, and it is maxing that core out. Is there something that needs to be done to allow it to utilize more than one core?
I have pasted my code here for easy readability: http://dpaste.org/LGeH/
I will also paste it below for easy access.
Code: Select all
using System;
using System.IO;
using System.Reflection;
using GdPicture;
namespace NewGdPictureTest
{
    /// <summary>
    /// Console tool that rasterizes every page of an input PDF and passes each
    /// page image to the GdPicture OCR writer, producing a new PDF file.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Process exit code reported when the run fails, so that calling
        /// scripts can tell a failed run from a successful one.
        /// </summary>
        const int ExitFailure = 1;

        /// <summary>
        /// Entry point. Expects exactly two arguments: the input PDF path and
        /// the output PDF path. Exits with a non-zero code on any fatal error
        /// and also reports a non-zero code when individual pages failed.
        /// </summary>
        /// <param name="args">Command line arguments: input file, output file.</param>
        static void Main(string[] args)
        {
            // make sure we have enough arguments
            if (args.Length != 2)
            {
                PrintUsage();
                Environment.Exit(ExitFailure);
            }

            // make sure the input file exists
            String file_original = args[0];
            if (!File.Exists(file_original))
            {
                Fail(String.Format("Error opening original pdf: {0}", file_original));
            }

            // make sure the dictionaries directory exists (next to the executable)
            String dictionaries = Path.Combine(
                Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location),
                "dictionaries");
            if (!Directory.Exists(dictionaries))
            {
                Fail("The dictionaries do not exist.");
            }

            // delete the previous output file if it exists
            String file_produced = args[1];
            if (File.Exists(file_produced))
            {
                File.Delete(file_produced);
            }

            // instantiate all the required objects
            GdPicturePDF the_pdf = new GdPicturePDF();
            GdPictureImaging the_imaging = new GdPictureImaging();

            // unlock them
            if (!the_pdf.SetLicenseNumber("xxx") || !the_imaging.SetLicenseNumber("xxx") || !the_imaging.SetLicenseNumberOCRTesseract("xxx"))
            {
                Fail("Invalid license numbers.");
            }

            // flatten the original pdf into image only pdf
            if (the_pdf.LoadFromFile(file_original, false) != GdPictureStatus.OK)
            {
                Log(String.Format("Error opening original pdf; {0}.", file_original));
                Environment.ExitCode = ExitFailure;
                return;
            }

            bool all_pages_ok = true;
            try
            {
                // create the final OCRed pdf
                int ocr_id = the_imaging.PdfOCRStart(file_produced, false, "", "", "", "", "");
                try
                {
                    int page_count = the_pdf.GetPageCount();
                    for (int page = 1; page <= page_count; page++)
                    {
                        Console.WriteLine("Flattening and OCRing page {0} of {1}.", page, page_count);
                        if (!ProcessPage(the_pdf, the_imaging, ocr_id, page, dictionaries))
                        {
                            all_pages_ok = false;
                        }
                    }
                }
                finally
                {
                    // stop the OCRing, even if a page blew up half way through
                    the_imaging.PdfOCRStop(ocr_id);
                }
            }
            finally
            {
                // close the original pdf
                the_pdf.CloseDocument();
            }

            if (!all_pages_ok)
            {
                // the output was written, but at least one page is missing from it
                Environment.ExitCode = ExitFailure;
            }
        }

        /// <summary>
        /// Selects one page of the source document, rasterizes it and adds the
        /// resulting image to the OCR output. Failures are logged, not thrown.
        /// </summary>
        /// <param name="the_pdf">The loaded source document.</param>
        /// <param name="the_imaging">The imaging object that owns the OCR output.</param>
        /// <param name="ocr_id">Identifier returned by PdfOCRStart.</param>
        /// <param name="page_number">1-based number of the page to process.</param>
        /// <param name="dictionaries">Path of the OCR dictionaries directory.</param>
        /// <returns>true when the page was selected and rasterized; false otherwise.</returns>
        static bool ProcessPage(GdPicturePDF the_pdf, GdPictureImaging the_imaging, int ocr_id, int page_number, String dictionaries)
        {
            // activate the current page
            if (!the_pdf.SelectPage(page_number))
            {
                Log(String.Format("Could not select page {0}.", page_number));
                return false;
            }

            // rasterize the active page
            int image_id = the_pdf.RenderPageToGdPictureImage(200, true);
            if (image_id <= 0)
            {
                Log(String.Format("Could not rasterize page {0}.", page_number));
                return false;
            }

            try
            {
                // NOTE(review): the return value of this call is ignored, so an OCR
                // failure on a page goes unnoticed - confirm its return type in the
                // GdPicture reference and check it here.
                the_imaging.PdfAddGdPictureImageToPdfOCR(ocr_id, image_id, "eng", dictionaries, "");
            }
            finally
            {
                // always hand the image back, otherwise every failed page leaks a handle
                the_imaging.ReleaseGdPictureImage(image_id);
            }
            return true;
        }

        /// <summary>
        /// Logs a fatal error and terminates the process with a failure exit code.
        /// </summary>
        /// <param name="p_message">The message to log before exiting.</param>
        static void Fail(String p_message)
        {
            Log(p_message);
            Environment.Exit(ExitFailure);
        }

        /// <summary>
        /// display the required parameters.
        /// </summary>
        static void PrintUsage()
        {
            Log("Usage: TheFlattener.exe <string:input_file> <string:output_file>");
        }

        /// <summary>
        /// Writes a message to the console and appends it, prefixed with the
        /// current date and time, to a log file named after today's date
        /// (yy-MM-dd.log). The file name carries no directory, so it resolves
        /// against the current working directory.
        /// </summary>
        /// <param name="p_message">The message to write.</param>
        static void Log(String p_message)
        {
            Console.WriteLine(p_message);
            using (StreamWriter file = new StreamWriter(String.Format("{0}.log", DateTime.Today.ToString("yy-MM-dd")), true))
            {
                file.WriteLine("{0} - {1}", DateTime.Now.ToString(), p_message);
            }
        }
    }
}