by
8,582 18
0
6
2,786
3
Top 1% !
Popular
Famous
Easy-to-find
Specified
OpenSource
Popularity: 5874th place
Language: stata
License: MIT_X11

How to Extract Text from PDF Documents Based on Columns inside .NET Apps

Copy Embed Code
<iframe id="embedFrame" style="width:600px; height:300px;"
src="https://www.snip2code.com/Embed/1021582/How-to-Extract-Text-from-PDF-Documents-B?startLine=0"></iframe>
Click on the embed code to copy it into your clipboard Width Height
Leave empty to retrieve all the content Start End
// Extracting text from PDF documents with Aspose.Pdf for .NET.
//
// Two alternative approaches are shown, each as a C# sample and a VB.NET sample.
// The samples are independent of one another: each declares its own pdfDocument,
// textAbsorber and extractedText, so paste only one of them into a given scope.

// ---------------------------------------------------------------------------
// First approach - reduce the text size, then extract the text.
// ---------------------------------------------------------------------------

// [C# Code Sample]
string path = "D:\\Temp\\";
InitLicense();
Document pdfDocument = new Document(path + "net_New-age NED's.pdf");

// Collect every text fragment on every page.
TextFragmentAbsorber tfa = new TextFragmentAbsorber();
pdfDocument.Pages.Accept(tfa);
TextFragmentCollection tfc = tfa.TextFragments;
foreach (TextFragment tf in tfc)
{
    // Scale the fragment's font size to 70% of its current value.
    tf.TextState.FontSize = tf.TextState.FontSize * 0.7f;
}

// Save the modified document to a temporary in-memory stream and reload it from
// there, so the extraction below runs against the reduced-size text. The stream
// is disposed once extraction is finished; pdfDocument must not be used after
// this block because it was loaded from that stream.
using (Stream st = new MemoryStream())
{
    pdfDocument.Save(st);
    pdfDocument = new Document(st);

    TextAbsorber textAbsorber = new TextAbsorber();
    pdfDocument.Pages.Accept(textAbsorber);
    String extractedText = textAbsorber.Text;

    // No further textAbsorber.Visit(pdfDocument) call is needed here: the text
    // has already been read above, so another pass would only repeat the work.
    System.IO.File.WriteAllText(path + "Extracted.txt", extractedText);
}

' [VB.NET Code Sample]
' VB.NET string literals have no backslash escapes, so the path uses single backslashes.
Dim path As String = "D:\Temp\"
' Instantiate the Document object.
Dim pdfDocument As Document = New Document(path + "net_New-age NED's.pdf")

' Collect every text fragment on every page.
Dim tfa As Aspose.Pdf.Text.TextFragmentAbsorber = New Aspose.Pdf.Text.TextFragmentAbsorber()
pdfDocument.Pages.Accept(tfa)
Dim tfc As Aspose.Pdf.Text.TextFragmentCollection = tfa.TextFragments
For Each tf As Aspose.Pdf.Text.TextFragment In tfc
    ' Scale the fragment's font size to 70% of its current value.
    tf.TextState.FontSize = tf.TextState.FontSize * 0.7F
Next

' Create a temporary stream object; it is disposed when the Using block ends,
' so pdfDocument must not be used after End Using.
Using st As Stream = New MemoryStream()
    ' Save the PDF file with the reduced font size.
    pdfDocument.Save(st)
    ' Instantiate the Document object from the stream instance.
    pdfDocument = New Document(st)

    Dim textAbsorber As Aspose.Pdf.Text.TextAbsorber = New Aspose.Pdf.Text.TextAbsorber()
    pdfDocument.Pages.Accept(textAbsorber)
    Dim extractedText As String = textAbsorber.Text

    ' No further textAbsorber.Visit(pdfDocument) call is needed here: the text
    ' has already been read above, so another pass would only repeat the work.
    System.IO.File.WriteAllText(path + "Extracted.txt", extractedText)
End Using

// ---------------------------------------------------------------------------
// Second approach - using ScaleFactor.
// inputFile and outFile are placeholders for the source PDF path and the
// destination text-file path; they are not defined in this snippet.
// ---------------------------------------------------------------------------

// [C# Code Sample]
Document pdfDocument = new Document(inputFile);
TextAbsorber textAbsorber = new TextAbsorber();
textAbsorber.ExtractionOptions = new TextExtractionOptions(TextExtractionOptions.TextFormattingMode.Pure);

// A scale factor of 0.5 is enough to split the columns in the majority of documents.
// A value of zero lets the algorithm choose the scale factor automatically.
textAbsorber.ExtractionOptions.ScaleFactor = 0.5; /* or 0 */

pdfDocument.Pages.Accept(textAbsorber);
String extractedText = textAbsorber.Text;
System.IO.File.WriteAllText(outFile, extractedText);

' [VB.NET Code Sample]
Dim pdfDocument As Document = New Document(inputFile)
Dim textAbsorber As Aspose.Pdf.Text.TextAbsorber = New Aspose.Pdf.Text.TextAbsorber()
textAbsorber.ExtractionOptions = New TextExtractionOptions(TextExtractionOptions.TextFormattingMode.Pure)

' A scale factor of 0.5 is enough to split the columns in the majority of documents.
' A value of zero lets the algorithm choose the scale factor automatically.
textAbsorber.ExtractionOptions.ScaleFactor = 0.5 ' or 0

pdfDocument.Pages.Accept(textAbsorber)
Dim extractedText As String = textAbsorber.Text
System.IO.File.WriteAllText(outFile, extractedText)
If you want to be updated about similar snippets, sign in and follow our Channels.

blog comments powered by Disqus