Skip to content
This repository has been archived by the owner on Jul 20, 2023. It is now read-only.

Beta2ga pdf ocr #104

Merged
merged 17 commits into from
Jul 2, 2018
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 68 additions & 0 deletions samples/detect.js
Original file line number Diff line number Diff line change
Expand Up @@ -770,6 +770,67 @@ function detectFulltextGCS(bucketName, fileName) {
// [END vision_fulltext_detection_gcs]
}

function detectPdfText(bucketName, fileName) {
// [START vision_async_detect_document_ocr]

// Imports the Google Cloud client libraries
const vision = require('@google-cloud/vision').v1;

// Creates a client
const client = new vision.ImageAnnotatorClient();

/**
* TODO(developer): Uncomment the following lines before running the sample.
*/
// Bucket where the file resides
// const bucketName = 'my-bucket';
// Path to PDF file within bucket
// const fileName = 'path/to/document.pdf';

const gcsSourceUri = `gs://${bucketName}/${fileName}`;
const gcsDestinationUri = `gs://${bucketName}/${fileName}.json`;

const inputConfig = {
// Supported mime_types are: 'application/pdf' and 'image/tiff'
mimeType: 'application/pdf',
gcsSource: {
uri: gcsSourceUri,
},
};
const outputConfig = {
gcsDestination: {
uri: gcsDestinationUri,
},
};
const features = [{type: 'DOCUMENT_TEXT_DETECTION'}];
const request = {
requests: [
{
inputConfig: inputConfig,
features: features,
outputConfig: outputConfig,
},
],
};

client
.asyncBatchAnnotateFiles(request)
.then(results => {
const operation = results[0];
// Get a Promise representation of the final result of the job
return operation.promise();
})
.then(filesResponse => {
let destinationUri =
filesResponse[0].responses[0].outputConfig.gcsDestination.uri;
console.log('Json saved to: ' + destinationUri);
})
.catch(function(error) {

This comment was marked as spam.

console.log(error);
});
// [END vision_async_detect_document_ocr]
}

require(`yargs`) // eslint-disable-line
.demand(1)
.command(
Expand Down Expand Up @@ -901,6 +962,12 @@ require(`yargs`) // eslint-disable-line
{},
opts => detectFulltextGCS(opts.bucketName, opts.fileName)
)
.command(
`pdf <bucketName> <fileName>`,
`Extracts full text from a pdf file`,
{},
opts => detectPdfText(opts.bucketName, opts.fileName)
)
.example(`node $0 faces ./resources/face_no_surprise.jpg`)
.example(`node $0 faces-gcs my-bucket your-image.jpg`)
.example(`node $0 labels ./resources/wakeupcat.jpg`)
Expand All @@ -923,6 +990,7 @@ require(`yargs`) // eslint-disable-line
.example(`node $0 web-geo-gcs my-bucket your-image.jpg`)
.example(`node $0 fulltext ./resources/wakeupcat.jpg`)
.example(`node $0 fulltext-gcs my-bucket your-image.jpg`)
.example(`node $0 pdf my-bucket my-pdf.pdf`)
.wrap(120)
.recommendCommands()
.epilogue(`For more information, see https://cloud.google.com/vision/docs`)
Expand Down
97 changes: 0 additions & 97 deletions samples/detect.v1p2beta1.js

This file was deleted.

9 changes: 9 additions & 0 deletions samples/system-test/detect.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ const files = [
`wakeupcat.jpg`,
`faulkner.jpg`,
`city.jpg`,
'pdf-ocr.pdf',
].map(name => {
return {
name,
Expand Down Expand Up @@ -252,3 +253,11 @@ test(`should read a document from a remote file`, async t => {
);
t.true(output.includes('Google Cloud Platform'));
});

// System test for the `pdf` sub-command: runs the sample CLI against a file
// in the test bucket and checks that the output names the generated JSON.
test(`should extract text from pdf file`, async t => {
  // NOTE(review): index 7 is presumably the 'pdf-ocr.pdf' fixture appended to
  // the `files` list — confirm if that list is ever reordered.
  const pdfCommand = `${cmd} pdf ${bucketName} ${files[7].name}`;
  const cliOutput = await tools.runAsync(pdfCommand, cwd);
  const mentionsJsonFile = cliOutput.includes('pdf-ocr.pdf.json');
  t.true(mentionsJsonFile);
});
56 changes: 0 additions & 56 deletions samples/system-test/detect.v1p2beta1.test.js

This file was deleted.