Skip to content

Commit

Permalink
update scala (#13412)
Browse files Browse the repository at this point in the history
  • Loading branch information
aymanechilah authored Jan 24, 2023
1 parent 3ed9aea commit 9922f40
Show file tree
Hide file tree
Showing 21 changed files with 132 additions and 160 deletions.
61 changes: 27 additions & 34 deletions docs/_posts/aymanechilah/2023-01-03-image_processing_en_3_2.md
Original file line number Diff line number Diff line change
Expand Up @@ -186,40 +186,33 @@ val multiple_image = new GPUImageTransformer()
.setOutputCol("multiple_image")
.transform(image_df)

val pipeline_scaled = new PipelineModel.setStages(Array(
binary_to_image,
scaled_image_df
))

val pipeline_thresholded = new PipelineModel.setStages(Array(
binary_to_image,
thresholded_image
))

val pipeline_eroded = new PipelineModel.setStages(Array(
binary_to_image,
eroded_image
))

val pipeline_dilated = new PipelineModel.setStages(Array(
binary_to_image,
dilated_image
))

val pipeline_removebg = new PipelineModel.setStages(Array(
binary_to_image,
removebg_image
))

val pipeline_deblured = new PipelineModel.setStages(Array(
binary_to_image,
deblured_image
))

val pipeline_multiple = new PipelineModel.setStages(Array(
binary_to_image,
multiple_image
))
val pipeline_scaled = new PipelineModel().setStages(Array(
binary_to_image,
scaled_image_df))

val pipeline_thresholded = new PipelineModel().setStages(Array(
binary_to_image,
thresholded_image))

val pipeline_eroded = new PipelineModel().setStages(Array(
binary_to_image,
eroded_image))

val pipeline_dilated = new PipelineModel().setStages(Array(
binary_to_image,
dilated_image))

val pipeline_removebg = new PipelineModel().setStages(Array(
binary_to_image,
removebg_image))

val pipeline_deblured = new PipelineModel().setStages(Array(
binary_to_image,
deblured_image))

val pipeline_multiple = new PipelineModel().setStages(Array(
binary_to_image,
multiple_image))

val image_path = pkg_resources.resource_filename("sparkocr", "resources/ocr/images/check.jpg")
val image_example_df = spark.read.format("binaryFile").load(image_path)
Expand Down
13 changes: 6 additions & 7 deletions docs/_posts/aymanechilah/2023-01-03-ner_deid_large_en_3_2.md
Original file line number Diff line number Diff line change
Expand Up @@ -196,13 +196,12 @@ val drawRegions = new ImageDrawRegions()
.setRectColor(Color.gray)

# OCR pipeline
val pipeline = new Pipeline.setStages(Array(
binary_to_image,
ocr,
deidentification_nlp_pipeline(input_column="text"),
position_finder,
drawRegions
))
val pipeline = new Pipeline().setStages(Array(
binary_to_image,
ocr,
deidentification_nlp_pipeline(input_column="text"),
position_finder,
drawRegions))

val image_path = pkg_resources.resource_filename("sparkocr", "resources/ocr/images/p1.jpg")
val image_df = spark.read.format("binaryFile").load(image_path)
Expand Down
7 changes: 3 additions & 4 deletions docs/_posts/aymanechilah/2023-01-03-ocr_streaming_en_3_0.md
Original file line number Diff line number Diff line change
Expand Up @@ -93,10 +93,9 @@ val ocr = new ImageToText()
.setConfidenceThreshold(60)

# OCR pipeline
val pipeline = new PipelineModel.setStages(Array(
pdf_to_image,
ocr
))
val pipeline = new PipelineModel().setStages(Array(
pdf_to_image,
ocr))

# fill path to folder with PDF's here
val dataset_path = "data/pdfs/*.pdf"
Expand Down
9 changes: 4 additions & 5 deletions docs/_posts/aymanechilah/2023-01-03-pdf_processing_en_3_2.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,11 +84,10 @@ val ocr = new ImageToText()
.setOutputCol("text")
.setConfidenceThreshold(60)

val pipeline = new PipelineModel.setStages(Array(
pdf_to_text,
pdf_to_image,
ocr
))
val pipeline = new PipelineModel().setStages(Array(
pdf_to_text,
pdf_to_image,
ocr))

val pdf_path = pkg_resources.resource_filename("sparkocr", "resources/ocr/pdfs/*.pdf")
val pdf_example_df = spark.read.format("binaryFile").load(pdf_path).cache()
Expand Down
19 changes: 9 additions & 10 deletions docs/_posts/aymanechilah/2023-01-03-table_recognition_en_3_3.md
Original file line number Diff line number Diff line change
Expand Up @@ -169,16 +169,15 @@ val ocr = new ImageToText()
.setKeepLayout(True)
.setOutputSpaceCharacterWidth(8)

val pipeline_table = new PipelineModel.setStages(Array(
binary_to_image,
table_detector,
draw_regions,
fill_regions,
splitter,
cell_detector,
table_recognition,
ocr
))
val pipeline_table = new PipelineModel().setStages(Array(
binary_to_image,
table_detector,
draw_regions,
fill_regions,
splitter,
cell_detector,
table_recognition,
ocr))

val imagePath = "/content/cTDaR_t10096.jpg"
val df = spark.read.format("binaryFile").load(imagePath)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,10 +71,9 @@ val doc_class = VisualDocumentClassifierV3()
.setOutputCol("label")

# OCR pipeline
val pipeline = new PipelineModel.setStages(Array(
binary_to_image,
doc_class
))
val pipeline = new PipelineModel().setStages(Array(
binary_to_image,
doc_class))

val test_image_path = pkg_resources.resource_filename("sparkocr", "resources/ocr/visualdoc/00556614_00556648.tif")
val bin_df = spark.read.format("binaryFile").load(test_image_path).limit(50)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,11 +79,10 @@ val draw_regions = new ImageDrawRegions()
.setOutputCol("image_with_regions")
.setRectColor(Color.red)

val pipeline = new PipelineModel.setStages(Array(
binary_to_image,
table_detector,
draw_regions
))
val pipeline = new PipelineModel().setStages(Array(
binary_to_image,
table_detector,
draw_regions))

# Download image:
# !wget -q https://github.com/JohnSnowLabs/spark-ocr-workshop/raw/4.0.0-release-candidate/jupyter/data/tab_images/cTDaR_t10168.jpg
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,11 +86,10 @@ val draw_regions = new ImageDrawRegions()
.setRectColor(Color.green)
.setRotated(True)

val pipeline = new PipelineModel.setStages(Array(
binary_to_image,
text_detector,
draw_regions
))
val pipeline = new PipelineModel().setStages(Array(
binary_to_image,
text_detector,
draw_regions))

val imagePath = pkg_resources.resource_filename("sparkocr", "resources/ocr/text_detection/020_Yas_patella1.jpg")
val image_df = spark.read.format("binaryFile").load(imagePath).sort("path")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -104,12 +104,11 @@ val draw_regions = new ImageDrawRegions()
.setRectColor(Color.green)
.setRotated(True)

val pipeline = new PipelineModel.setStages(Array(
binary_to_image,
text_detector,
ocr,
draw_regions
))
val pipeline = new PipelineModel().setStages(Array(
binary_to_image,
text_detector,
ocr,
draw_regions))

# Download image:
# !wget -q https://github.com/JohnSnowLabs/spark-ocr-workshop/raw/4.0.0-release-candidate/jupyter/data/handwritten/handwritten_example.jpg
Expand Down
13 changes: 6 additions & 7 deletions docs/_posts/aymanechilah/2023-01-10-ocr_small_printed_en_2_4.md
Original file line number Diff line number Diff line change
Expand Up @@ -108,13 +108,12 @@ val draw_annotations = new ImageDrawAnnotations()
.setFontSize(14)
.setRectColor(Color.red)

val pipeline = new PipelineModel.setStages(Array(
binary_to_image,
text_detector,
ocr,
tokenizer,
draw_annotations
))
val pipeline = new PipelineModel().setStages(Array(
binary_to_image,
text_detector,
ocr,
tokenizer,
draw_annotations))

val image_path = pkg_resources.resource_filename("sparkocr", "resources/ocr/images/check.jpg")
val image_example_df = spark.read.format("binaryFile").load(image_path)
Expand Down
11 changes: 5 additions & 6 deletions docs/_posts/aymanechilah/2023-01-10-text_cleaner_v1_en_3_2.md
Original file line number Diff line number Diff line change
Expand Up @@ -101,12 +101,11 @@ val ocr_corrected = new ImageToText()
.setConfidenceThreshold(70)
.setIgnoreResolution(False)

val pipeline = new PipelineModel.setStages(Array(
pdf_to_image,
ocr,
cleaner,
ocr_corrected
))
val pipeline = new PipelineModel().setStages(Array(
pdf_to_image,
ocr,
cleaner,
ocr_corrected))

val pdf_example = "data/pdfs/noised.pdf"
val pdf_example_df = spark.read.format("binaryFile").load(pdf_example).cache()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -115,13 +115,12 @@ val draw = new ImageDrawAnnotations()
.setRectColor(Color.red)

# OCR pipeline
val pipeline = new PipelineModel.setStages(Array(
binary_to_image,
img_to_hocr,
tokenizer,
doc_ner,
draw
))
val pipeline = new PipelineModel().setStages(Array(
binary_to_image,
img_to_hocr,
tokenizer,
doc_ner,
draw))

val bin_df = spark.read.format("binaryFile").load("data/t01.jpg")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -113,15 +113,14 @@ val draw = new ImageDrawAnnotations()
.setRectColor(Color.red)

# OCR pipeline
val pipeline = new PipelineModel.setStages(Array(
binary_to_image,
img_to_hocr,
tokenizer,
doc_ner,
draw
))
val pipeline = new PipelineModel().setStages(Array(
binary_to_image,
img_to_hocr,
tokenizer,
doc_ner,
draw))

val bin_df = spark.read.format("binaryFile").load("data/t01.jpg")
val bin_df = spark.read.format("binaryFile").load("data/t01.jpg")

val results = pipeline.transform(bin_df).cache()
val res = results.collect()
Expand Down Expand Up @@ -182,4 +181,4 @@ val results.withColumn("filename", path_array.getItem(f.size(path_array)- 1))

## References

Sec 10K filings
Sec 10K filings
Original file line number Diff line number Diff line change
Expand Up @@ -77,10 +77,9 @@ val visual_question_answering = VisualQuestionAnswering()
.setQuestionsCol("questions")

# OCR pipeline
val pipeline = new PipelineModel.setStages(Array(
binary_to_image,
visual_question_answering
))
val pipeline = new PipelineModel().setStages(Array(
binary_to_image,
visual_question_answering))

val test_image_path = pkg_resources.resource_filename("sparkocr", "resources/ocr/vqa/agenda.png")
val bin_df = spark.read.format("binaryFile").load(test_image_path)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,10 +76,9 @@ val visual_question_answering = VisualQuestionAnswering()
.setQuestionsCol("questions")

# OCR pipeline
val pipeline = new PipelineModel.setStages(Array(
binary_to_image,
visual_question_answering
))
val pipeline = new PipelineModel().setStages(Array(
binary_to_image,
visual_question_answering))

val test_image_path = pkg_resources.resource_filename("sparkocr", "resources/ocr/vqa/agenda.png")
val bin_df = spark.read.format("binaryFile").load(test_image_path)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -103,12 +103,11 @@ val draw_regions = new ImageDrawRegions()
.setRectColor(Color.green)
.setRotated(True)

val pipeline = new PipelineModel.setStages(Array(
binary_to_image,
text_detector,
ocr,
draw_regions
))
val pipeline = new PipelineModel().setStages(Array(
binary_to_image,
text_detector,
ocr,
draw_regions))

# Download image:
# !wget -q https://github.com/JohnSnowLabs/spark-ocr-workshop/raw/4.0.0-release-candidate/jupyter/data/handwritten/handwritten_example.jpg
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -104,12 +104,11 @@ val draw_regions = new ImageDrawRegions()
.setRectColor(Color.green)
.setRotated(True)

val pipeline = new PipelineModel.setStages(Array(
binary_to_image,
text_detector,
ocr,
draw_regions
))
val pipeline = new PipelineModel().setStages(Array(
binary_to_image,
text_detector,
ocr,
draw_regions))

# Download image:
# !wget -q https://github.com/JohnSnowLabs/spark-ocr-workshop/raw/4.0.0-release-candidate/jupyter/data/handwritten/handwritten_example.jpg
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -102,12 +102,10 @@ val draw_regions = new ImageDrawRegions()
.setRectColor(Color.green)
.setRotated(True)

val pipeline = new PipelineModel.setStages(Array(
binary_to_image,
text_detector,
ocr,
draw_regions
))
val pipeline = new PipelineModel().setStages(Array(binary_to_image,
text_detector,
ocr,
draw_regions))

val imagePath = pkg_resources.resource_filename("sparkocr", "resources/ocr/images/check.jpg")
val image_df = spark.read.format("binaryFile").load(imagePath)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -101,12 +101,11 @@ val draw_regions = new ImageDrawRegions()
.setRectColor(Color.green)
.setRotated(True)

val pipeline = new PipelineModel.setStages(Array(
binary_to_image,
text_detector,
ocr,
draw_regions
))
val pipeline = new PipelineModel().setStages(Array(
binary_to_image,
text_detector,
ocr,
draw_regions))

val imagePath = pkg_resources.resource_filename("sparkocr", "resources/ocr/images/check.jpg")
val image_df = spark.read.format("binaryFile").load(imagePath)
Expand Down
Loading

0 comments on commit 9922f40

Please sign in to comment.