Skip to content

Commit

Permalink
Implement Preprocessing of Pdfs Into Images + Setup Pdfium
Browse files Browse the repository at this point in the history
  • Loading branch information
JesusFileto committed Oct 29, 2024
1 parent 1b6cf0b commit edde66a
Show file tree
Hide file tree
Showing 5 changed files with 114 additions and 13 deletions.
2 changes: 2 additions & 0 deletions libs/chonky/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ edition.workspace = true
error-stack = { workspace = true, public = true }

# Public third-party dependencies
pdfium-render = { version = "0.8.25", public = true }
image = { version = "0.24.9", public = true }

# Private workspace dependencies

Expand Down
Binary file added libs/chonky/libs/libpdfium.dylib
Binary file not shown.
90 changes: 77 additions & 13 deletions libs/chonky/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,26 +1,90 @@
#![doc = include_str!("../README.md")]

/// Adds two numbers together
///
/// # Example
///
/// ```rust
/// use chonky::add;
use image::DynamicImage;
use pdfium_render::prelude::*;

use error_stack::Report;


/// Opens the PDF document located at `file_path` through the given Pdfium bindings.
///
/// The returned [`PdfDocument`] borrows `pdfium` for `'a`, so the bindings must
/// outlive the document.
///
/// # Errors
///
/// Returns the [`PdfiumError`] produced by `Pdfium::load_pdf_from_file`, wrapped
/// in a [`Report`], when the document cannot be loaded.
pub fn load_pdf<'a>(
    pdfium: &'a Pdfium,
    file_path: &str,
) -> Result<PdfDocument<'a>, Report<PdfiumError>> {
    // `None` is the optional second argument of `load_pdf_from_file`
    // (the document password in pdfium-render's API); `?` converts the
    // `PdfiumError` into a `Report` at this call site.
    Ok(pdfium.load_pdf_from_file(file_path, None)?)
}

/// Takes in a pdf document and returns a vector list where each page
/// is processed into a raw image that can be later converted to any image format
///
/// assert_eq!(add(1, 3), 4);
/// ```
#[must_use]
pub const fn add(left: u64, right: u64) -> u64 {
left + right
/// Renders every page of `pdf` into an in-memory image.
///
/// Pages are rendered in the order produced by `pdf.pages().iter()`, each with a
/// target width of 1000 pixels, and returned as [`DynamicImage`] values that can
/// later be encoded into any image format.
///
/// # Errors
///
/// Returns the first [`PdfiumError`] raised while rendering a page, wrapped in a
/// [`Report`]; no images are returned in that case.
pub fn pdf_to_images(pdf: &PdfDocument) -> Result<Vec<DynamicImage>, Report<PdfiumError>> {
    // May be adjusted depending on the resolution needed downstream.
    let resolution_width = 1000;

    // The render configuration is identical for every page, so build it once
    // instead of once per loop iteration.
    let render_config = PdfRenderConfig::new().set_target_width(resolution_width);

    let mut images: Vec<DynamicImage> = Vec::new();

    for page in pdf.pages().iter() {
        // Render the entire page to a `PdfBitmap`.
        let rendered_page = page.render_with_config(&render_config)?;

        // Convert the `PdfBitmap` into a `DynamicImage`.
        //
        // NOTE(review): code scanning (clippy) reported "mismatched types" on the
        // `push` below. Presumably the `DynamicImage` returned by `as_image()`
        // comes from a different `image` crate version than the one this crate
        // depends on -- confirm and align the `image` version in Cargo.toml.
        images.push(rendered_page.as_image());
    }

    Ok(images)
}


#[cfg(test)]
mod tests {
    use super::*;

    /// Directory that is searched for the platform-specific Pdfium library.
    const PDFIUM_LIBRARY_DIR: &str = "./libs/";

    /// PDF fixture used by the success-path tests.
    const TEST_PDF_PATH: &str = "tests/docs/test-doc.pdf";

    /// Number of pages in the fixture at `TEST_PDF_PATH`.
    const TEST_PDF_PAGE_COUNT: usize = 38;

    /// Binds to the Pdfium library under `PDFIUM_LIBRARY_DIR`.
    ///
    /// Shared by every test so the binding logic lives in a single place.
    fn bind_pdfium() -> Pdfium {
        let bindings = Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name_at_path(
            PDFIUM_LIBRARY_DIR,
        ))
        .expect("the Pdfium library should be loadable from ./libs/");

        Pdfium::new(bindings)
    }

    #[test]
    fn pdf_load_success() {
        let pdfium = bind_pdfium();

        let pdf = load_pdf(&pdfium, TEST_PDF_PATH);

        assert!(pdf.is_ok());
    }

    #[test]
    fn pdf_load_failure() {
        let pdfium = bind_pdfium();

        let test_pdf_string = "tests/docs/invalid.pdf";

        let pdf = load_pdf(&pdfium, test_pdf_string);

        assert!(pdf.is_err());
    }

    #[test]
    fn pdf_image_conversion() {
        let pdfium = bind_pdfium();

        let pdf = load_pdf(&pdfium, TEST_PDF_PATH).unwrap();

        let preprocessed_pdf = pdf_to_images(&pdf).unwrap();

        // One image is produced per page, so the vector length must equal the page count.
        assert_eq!(preprocessed_pdf.len(), TEST_PDF_PAGE_COUNT);
    }
}
35 changes: 35 additions & 0 deletions libs/chonky/src/main.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
use std::env;
use chonky::*;
use pdfium_render::prelude::*;
use error_stack::Report;

fn main() -> Result<(), Report<PdfiumError>>{
let args: Vec<String> = env::args().collect();// read file path arguments

if args.len() < 2 {
panic!("Path to PDF not inputted")
}

let pdfium = Pdfium::new(
Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name_at_path("./libs/")).unwrap()
); // creates instance so must be global

let pdf = load_pdf(&pdfium, &args[1])?;

let preprocessed_pdf = pdf_to_images(&pdf)?;

//for now we will print all these images to a folder
// this will be a seperate function in the future once knowledge about error-stack increases

let output_folder = "./out";

for (index, image) in preprocessed_pdf.iter().enumerate() {
// Generate a unique filename for each page image
let file_path = format!("{}/page_{}.png", output_folder, index + 1);

// Save the image as a PNG file
image.save(&file_path).unwrap();
}

Ok(())
}
Binary file added libs/chonky/tests/docs/test-doc.pdf
Binary file not shown.

0 comments on commit edde66a

Please sign in to comment.