From fc21e033a6f202a9f2ad3d90031fd2c48f563996 Mon Sep 17 00:00:00 2001 From: Ahmetemintek Date: Wed, 19 Oct 2022 19:05:48 +0700 Subject: [PATCH 01/57] Add model 2022-10-19-ner_covid_trials_en --- .../2022-10-19-ner_covid_trials_en.md | 209 ++++++++++++++++++ 1 file changed, 209 insertions(+) create mode 100644 docs/_posts/Ahmetemintek/2022-10-19-ner_covid_trials_en.md diff --git a/docs/_posts/Ahmetemintek/2022-10-19-ner_covid_trials_en.md b/docs/_posts/Ahmetemintek/2022-10-19-ner_covid_trials_en.md new file mode 100644 index 00000000000000..b4ecb28ea1eb8b --- /dev/null +++ b/docs/_posts/Ahmetemintek/2022-10-19-ner_covid_trials_en.md @@ -0,0 +1,209 @@ +--- +layout: model +title: Extract Entities in Covid Trials +author: John Snow Labs +name: ner_covid_trials +date: 2022-10-19 +tags: [ner, en, clinical, licensed, covid] +task: Named Entity Recognition +language: en +edition: Spark NLP for Healthcare 4.2.0 +spark_version: 3.0 +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained named entity recognition deep learning model for extracting covid-related clinical terminology from covid trials. 
+ +## Predicted Entities + +`Stage`, `Severity`, `Virus`, `Trial_Design`, `Trial_Phase`, `N_Patients`, `Institution`, `Statistical_Indicator`, `Section_Header`, `Cell_Type`, `Cellular_component`, `Viral_components`, `Physiological_reaction`, `Biological_molecules`, `Admission_Discharge`, `Age`, `BMI`, `Cerebrovascular_Disease`, `Date`, `Death_Entity`, `Diabetes`, `Disease_Syndrome_Disorder`, `Dosage`, `Drug_Ingredient`, `Employment`, `Frequency`, `Gender`, `Heart_Disease`, `Hypertension`, `Obesity`, `Pulse`, `Race_Ethnicity`, `Respiration`, `Route`, `Smoking`, `Time`, `Total_Cholesterol`, `Treatment`, `VS_Finding`, `Vaccine`, `Vaccine_Name` + +{:.btn-box} +[Live Demo](https://demo.johnsnowlabs.com/healthcare/NER_COVID/){:.button.button-orange} +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/1.Clinical_Named_Entity_Recognition_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_covid_trials_en_4.2.0_3.0_1666177383134.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentenceDetector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") \ + .setInputCols(["document"]) \ + .setOutputCol("sentence") + +tokenizer = Tokenizer()\ + .setInputCols(["sentence"])\ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token"])\ + .setOutputCol("embeddings") + +ner = MedicalNerModel.pretrained("ner_covid_trials","en","clinical/models")\ + .setInputCols(["sentence","token","embeddings"])\ + .setOutputCol("ner")\ + .setLabelCasing("upper") + +ner_converter = NerConverter() \ + .setInputCols(["sentence", "token", "ner"]) \ + .setOutputCol("ner_chunk") + +ner_pipeline = Pipeline(stages=[ + documentAssembler, + sentenceDetector, + tokenizer, + word_embeddings, + ner, + ner_converter]) + +empty_data = spark.createDataFrame([[""]]).toDF("text") + +ner_model = ner_pipeline.fit(empty_data) + +text= """In December 2019 , a group of patients with the acute respiratory disease was detected in Wuhan , Hubei Province of China . A month later , a new beta-coronavirus was identified as the cause of the 2019 coronavirus infection . SARS-CoV-2 is a coronavirus that belongs to the group of β-coronaviruses of the subgenus Coronaviridae . The SARS-CoV-2 is the third known zoonotic coronavirus disease after severe acute respiratory syndrome ( SARS ) and Middle Eastern respiratory syndrome ( MERS ). 
The diagnosis of SARS-CoV-2 recommended by the WHO , CDC is the collection of a sample from the upper respiratory tract ( nasal and oropharyngeal exudate ) or from the lower respiratory tractsuch as expectoration of endotracheal aspirate and bronchioloalveolar lavage and its analysis using the test of real-time polymerase chain reaction ( qRT-PCR ).In 2020, the first COVID‑19 vaccine was developed and made available to the public through emergency authorizations and conditional approvals.""" + +results= ner_model.transform(spark.createDataFrame([[text]]).toDF('text')) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models") + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical" ,"en", "clinical/models") + .setInputCols(Array("sentence","token")) + .setOutputCol("embeddings") + +val ner_model = MedicalNerModel.pretrained("ner_covid_trials", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverter() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + +val pipeline = new Pipeline().setStages(Array(document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner_model, + ner_converter)) + +val data = Seq("""In December 2019 , a group of patients with the acute respiratory disease was detected in Wuhan , Hubei Province of China . A month later , a new beta-coronavirus was identified as the cause of the 2019 coronavirus infection . SARS-CoV-2 is a coronavirus that belongs to the group of β-coronaviruses of the subgenus Coronaviridae . 
The SARS-CoV-2 is the third known zoonotic coronavirus disease after severe acute respiratory syndrome ( SARS ) and Middle Eastern respiratory syndrome ( MERS ). The diagnosis of SARS-CoV-2 recommended by the WHO , CDC is the collection of a sample from the upper respiratory tract ( nasal and oropharyngeal exudate ) or from the lower respiratory tractsuch as expectoration of endotracheal aspirate and bronchioloalveolar lavage and its analysis using the test of real-time polymerase chain reaction ( qRT-PCR ).In 2020, the first COVID‑19 vaccine was developed and made available to the public through emergency authorizations and conditional approvals.""").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash +| | chunks | begin | end | sentence_id | entities | +|---:|:------------------------------------|--------:|------:|--------------:|:--------------------------| +| 0 | December 2019 | 3 | 15 | 0 | Date | +| 1 | acute respiratory disease | 48 | 72 | 0 | Disease_Syndrome_Disorder | +| 2 | beta-coronavirus | 146 | 161 | 1 | Virus | +| 3 | 2019 | 198 | 201 | 1 | Date | +| 4 | coronavirus infection | 203 | 223 | 1 | Disease_Syndrome_Disorder | +| 5 | SARS-CoV-2 | 228 | 237 | 2 | Virus | +| 6 | coronavirus | 244 | 254 | 2 | Virus | +| 7 | β-coronaviruses | 285 | 299 | 2 | Virus | +| 8 | subgenus Coronaviridae | 308 | 329 | 2 | Virus | +| 9 | SARS-CoV-2 | 337 | 346 | 3 | Virus | +| 10 | zoonotic coronavirus disease | 367 | 394 | 3 | Disease_Syndrome_Disorder | +| 11 | severe acute respiratory syndrome | 402 | 434 | 3 | Disease_Syndrome_Disorder | +| 12 | SARS | 438 | 441 | 3 | Disease_Syndrome_Disorder | +| 13 | Middle Eastern respiratory syndrome | 449 | 483 | 3 | Disease_Syndrome_Disorder | +| 14 | MERS | 487 | 490 | 3 | Disease_Syndrome_Disorder | +| 15 | SARS-CoV-2 | 513 | 522 | 4 | Virus | +| 16 | WHO | 543 | 545 | 4 | Institution | +| 17 | CDC | 549 | 551 | 4 | Institution | +| 18 | 2020 | 852 | 855 | 5 | Date | +| 19 | COVID‑19 vaccine | 868 | 883 | 5 | Vaccine_Name | +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_covid_trials| +|Compatibility:|Spark NLP for Healthcare 4.2.0+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|en| +|Size:|14.8 MB| + +## References + +This model is trained on data sampled from clinicaltrials.gov - covid trials, and annotated in-house. + +## Benchmarking + +```bash + label tp fp fn total precision recall f1 + Institution 34 8 20 55.0 0.7958 0.6343 0.706 + VS_Finding 19 2 1 20.0 0.9048 0.95 0.9268 + Respiration 5 0 0 5.0 1.0 1.0 1.0 +Cerebrovascular_D... 
5 2 2 7.0 0.7143 0.7143 0.7143 + Cell_Type 152 27 14 167.0 0.8479 0.9123 0.8789 + Heart_Disease 36 3 5 41.0 0.9231 0.878 0.9 + Severity 57 25 3 60.0 0.6881 0.95 0.7981 + N_Patients 27 3 1 29.0 0.8871 0.9483 0.9167 + Pulse 12 2 0 12.0 0.8571 1.0 0.9231 + Obesity 3 0 0 3.0 1.0 1.0 1.0 + Admission_Discharge 85 3 0 85.0 0.9659 1.0 0.9827 + Diabetes 8 0 0 8.0 1.0 1.0 1.0 + Section_Header 94 8 13 108.0 0.9154 0.8711 0.8927 + Age 22 1 0 22.0 0.9429 1.0 0.9706 + Cellular_component 40 21 10 50.0 0.6534 0.8 0.7193 + Hypertension 10 0 0 10.0 1.0 1.0 1.0 + BMI 5 1 1 6.0 0.8333 0.8333 0.8333 + Trial_Phase 13 0 1 14.0 0.9398 0.9286 0.9341 + Employment 98 12 8 107.0 0.8874 0.9206 0.9037 +Statistical_Indic... 76 29 11 88.0 0.7206 0.8689 0.7879 + Time 2 0 1 3.0 1.0 0.6667 0.8 + Total_Cholesterol 14 1 2 17.0 0.9355 0.8529 0.8923 + Drug_Ingredient 327 33 67 395.0 0.9084 0.8281 0.8664 +Physiological_rea... 27 7 14 41.0 0.7864 0.6585 0.7168 + Treatment 66 4 25 92.0 0.9433 0.7228 0.8185 + Vaccine 20 1 2 23.0 0.9531 0.8841 0.9173 +Disease_Syndrome_... 
774 70 41 816.0 0.9171 0.9495 0.933 + Virus 121 8 23 144.0 0.9365 0.8403 0.8858 + Frequency 57 1 2 59.9 0.9787 0.9556 0.967 + Route 37 4 10 47.0 0.9024 0.7872 0.8409 + Death_Entity 20 9 3 23.0 0.6897 0.8696 0.7692 + Stage 4 0 7 12.0 1.0 0.3889 0.56 + Vaccine_Name 10 1 0 10.0 0.9091 1.0 0.9524 + Trial_Design 32 13 8 41.0 0.7149 0.7951 0.7529 +Biological_molecules 251 91 53 305.0 0.7335 0.8233 0.7758 + Date 98 5 2 100.0 0.9492 0.98 0.9643 + Race_Ethnicity 0 0 2 2.0 0.0 0.0 0.0 + Gender 46 1 0 46.0 0.9787 1.0 0.9892 + Dosage 49 9 24 73.0 0.8376 0.6712 0.7452 + Viral_components 18 10 15 34.0 0.6512 0.549 0.5957 + +macro - - - - - - 0.8382 +micro - - - - - - 0.8704 +``` \ No newline at end of file From 7cd4129272fcbbdf79b3644360d00c1c16066191 Mon Sep 17 00:00:00 2001 From: Ahmetemintek Date: Wed, 19 Oct 2022 21:07:25 +0700 Subject: [PATCH 02/57] Add model 2022-10-19-ner_jsl_en --- .../Ahmetemintek/2022-10-19-ner_jsl_en.md | 250 ++++++++++++++++++ 1 file changed, 250 insertions(+) create mode 100644 docs/_posts/Ahmetemintek/2022-10-19-ner_jsl_en.md diff --git a/docs/_posts/Ahmetemintek/2022-10-19-ner_jsl_en.md b/docs/_posts/Ahmetemintek/2022-10-19-ner_jsl_en.md new file mode 100644 index 00000000000000..8d1fbe9ce5095f --- /dev/null +++ b/docs/_posts/Ahmetemintek/2022-10-19-ner_jsl_en.md @@ -0,0 +1,250 @@ +--- +layout: model +title: Detect Clinical Entities (ner_jsl) +author: John Snow Labs +name: ner_jsl +date: 2022-10-19 +tags: [ner, licensed, en, clinical] +task: Named Entity Recognition +language: en +edition: Spark NLP for Healthcare 4.2.0 +spark_version: 3.0 +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained named entity recognition deep learning model for clinical terminology. The SparkNLP deep learning model (MedicalNerModel) is inspired by a former state-of-the-art model for NER: Chiu & Nichols, Named Entity Recognition with Bidirectional LSTM-CNNs. 
This model is the official version of the jsl_ner_wip_clinical model. + +## Predicted Entities + +`Injury_or_Poisoning`, `Direction`, `Test`, `Admission_Discharge`, `Death_Entity`, `Relationship_Status`, `Duration`, `Respiration`, `Hyperlipidemia`, `Birth_Entity`, `Age`, `Labour_Delivery`, `Family_History_Header`, `BMI`, `Temperature`, `Alcohol`, `Kidney_Disease`, `Oncological`, `Medical_History_Header`, `Cerebrovascular_Disease`, `Oxygen_Therapy`, `O2_Saturation`, `Psychological_Condition`, `Heart_Disease`, `Employment`, `Obesity`, `Disease_Syndrome_Disorder`, `Pregnancy`, `ImagingFindings`, `Procedure`, `Medical_Device`, `Race_Ethnicity`, `Section_Header`, `Symptom`, `Treatment`, `Substance`, `Route`, `Drug_Ingredient`, `Blood_Pressure`, `Diet`, `External_body_part_or_region`, `LDL`, `VS_Finding`, `Allergen`, `EKG_Findings`, `Imaging_Technique`, `Triglycerides`, `RelativeTime`, `Gender`, `Pulse`, `Social_History_Header`, `Substance_Quantity`, `Diabetes`, `Modifier`, `Internal_organ_or_component`, `Clinical_Dept`, `Form`, `Drug_BrandName`, `Strength`, `Fetus_NewBorn`, `RelativeDate`, `Height`, `Test_Result`, `Sexually_Active_or_Sexual_Orientation`, `Frequency`, `Time`, `Weight`, `Vaccine`, `Vaccine_Name`, `Vital_Signs_Header`, `Communicable_Disease`, `Dosage`, `Overweight`, `Hypertension`, `HDL`, `Total_Cholesterol`, `Smoking`, `Date` + +{:.btn-box} +[Live Demo](https://demo.johnsnowlabs.com/healthcare/NER_JSL/){:.button.button-orange} +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/1.Clinical_Named_Entity_Recognition_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_jsl_en_4.2.0_3.0_1666181370373.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentenceDetector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") \ + .setInputCols(["document"]) \ + .setOutputCol("sentence") + +tokenizer = Tokenizer()\ + .setInputCols(["sentence"])\ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token"])\ + .setOutputCol("embeddings") + +ner = MedicalNerModel.pretrained("ner_jsl","en","clinical/models")\ + .setInputCols(["sentence","token","embeddings"])\ + .setOutputCol("ner")\ + .setLabelCasing("upper") + +ner_converter = NerConverter() \ + .setInputCols(["sentence", "token", "ner"]) \ + .setOutputCol("ner_chunk") + +ner_pipeline = Pipeline(stages=[ + documentAssembler, + sentenceDetector, + tokenizer, + word_embeddings, + ner, + ner_converter]) + +empty_data = spark.createDataFrame([[""]]).toDF("text") + +ner_model = ner_pipeline.fit(empty_data) + +data = spark.createDataFrame([["""The patient is a 21-day-old Caucasian male here for 2 days of congestion - mom has been suctioning yellow discharge from the patient's nares, plus she has noticed some mild problems with his breathing while feeding (but negative for any perioral cyanosis or retractions). Additionally, there is no side effect observed after Influenza vaccine. One day ago, mom also noticed a tactile temperature and gave the patient Tylenol. Baby also has had some decreased p.o. intake. His normal breast-feeding is down from 20 minutes q.2h. to 5 to 10 minutes secondary to his respiratory congestion. He sleeps well, but has been more tired and has been fussy over the past 2 days. The parents noticed no improvement with albuterol treatments given in the ER. 
His urine output has also decreased; normally he has 8 to 10 wet and 5 dirty diapers per 24 hours, now he has down to 4 wet diapers per 24 hours. Mom denies any diarrhea. His bowel movements are yellow colored and soft in nature. +"""]]).toDF("text") + + +result = ner_model.transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentenceDetector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val jsl_ner = MedicalNerModel.pretrained("ner_jsl", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("jsl_ner") + +val jsl_ner_converter = new NerConverter() + .setInputCols(Array("sentence", "token", "jsl_ner")) + .setOutputCol("ner_chunk") + +val jsl_ner_pipeline = new Pipeline().setStages(Array( + documentAssembler, + sentenceDetector, + tokenizer, + embeddings, + jsl_ner, + jsl_ner_converter)) + + +val data = Seq("""The patient is a 21-day-old Caucasian male here for 2 days of congestion - mom has been suctioning yellow discharge from the patient's nares, plus she has noticed some mild problems with his breathing while feeding (but negative for any perioral cyanosis or retractions). Additionally, there is no side effect observed after Influenza vaccine. One day ago, mom also noticed a tactile temperature and gave the patient Tylenol. Baby also has had some decreased p.o. intake. His normal breast-feeding is down from 20 minutes q.2h. to 5 to 10 minutes secondary to his respiratory congestion. He sleeps well, but has been more tired and has been fussy over the past 2 days. 
The parents noticed no improvement with albuterol treatments given in the ER. His urine output has also decreased; normally he has 8 to 10 wet and 5 dirty diapers per 24 hours, now he has down to 4 wet diapers per 24 hours. Mom denies any diarrhea. His bowel movements are yellow colored and soft in nature.""").toDS.toDF("text") + +val result = jsl_ner_pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash +| | chunks | begin | end | sentence_id | entities | +|---:|:------------------------------------------|--------:|------:|--------------:|:-----------------------------| +| 0 | 21-day-old | 18 | 27 | 0 | Age | +| 1 | Caucasian | 29 | 37 | 0 | Race_Ethnicity | +| 2 | male | 39 | 42 | 0 | Gender | +| 3 | 2 days | 53 | 58 | 0 | Duration | +| 4 | congestion | 63 | 72 | 0 | Symptom | +| 5 | mom | 76 | 78 | 0 | Gender | +| 6 | suctioning yellow discharge | 89 | 115 | 0 | Symptom | +| 7 | nares | 136 | 140 | 0 | External_body_part_or_region | +| 8 | she | 148 | 150 | 0 | Gender | +| 9 | mild | 169 | 172 | 0 | Modifier | +| 10 | problems with his breathing while feeding | 174 | 214 | 0 | Symptom | +| 11 | perioral cyanosis | 238 | 254 | 0 | Symptom | +| 12 | retractions | 259 | 269 | 0 | Symptom | +| 13 | Influenza vaccine | 326 | 342 | 1 | Vaccine_Name | +| 14 | One day ago | 345 | 355 | 2 | RelativeDate | +| 15 | mom | 358 | 360 | 2 | Gender | +| 16 | tactile temperature | 377 | 395 | 2 | Symptom | +| 17 | Tylenol | 418 | 424 | 2 | Drug_BrandName | +| 18 | Baby | 427 | 430 | 3 | Age | +| 19 | decreased p.o | 450 | 462 | 3 | Symptom | +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_jsl| +|Compatibility:|Spark NLP for Healthcare 4.2.0+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|en| +|Size:|15.2 MB| + +## References + +Trained on data gathered and manually annotated by John Snow Labs. https://www.johnsnowlabs.com/data/ + +## Benchmarking + +```bash + label tp fp fn total precision recall f1 + VS_Finding 207.0 37.0 26.0 233.0 0.8484 0.8884 0.8679 + Direction 3642.0 418.0 264.0 3906.0 0.897 0.9324 0.9144 + Respiration 58.0 5.0 4.0 62.0 0.9206 0.9355 0.928 +Cerebrovascular_D... 93.0 20.0 12.0 105.0 0.823 0.8857 0.8532 +Family_History_He... 
77.0 1.0 1.0 78.0 0.9872 0.9872 0.9872 + Heart_Disease 453.0 47.0 50.0 503.0 0.906 0.9006 0.9033 + ImagingFindings 85.0 35.0 101.0 186.0 0.7083 0.457 0.5556 + RelativeTime 156.0 36.0 50.0 206.0 0.8125 0.7573 0.7839 + Strength 648.0 24.0 27.0 675.0 0.9643 0.96 0.9621 + Smoking 115.0 8.0 3.0 118.0 0.935 0.9746 0.9544 + Medical_Device 3167.0 368.0 283.0 3450.0 0.8959 0.918 0.9068 + Allergen 1.0 0.0 8.0 9.0 1.0 0.1111 0.2 + EKG_Findings 36.0 13.0 33.0 69.0 0.7347 0.5217 0.6102 + Pulse 119.0 14.0 6.0 125.0 0.8947 0.952 0.9225 +Psychological_Con... 117.0 14.0 15.0 132.0 0.8931 0.8864 0.8897 + Triglycerides 4.0 1.0 0.0 4.0 0.8 1.0 0.8889 + Overweight 3.0 0.0 0.0 3.0 1.0 1.0 1.0 + Obesity 40.0 1.0 1.0 41.0 0.9756 0.9756 0.9756 + Admission_Discharge 307.0 26.0 5.0 312.0 0.9219 0.984 0.9519 + HDL 3.0 0.0 1.0 4.0 1.0 0.75 0.8571 + Diabetes 117.0 3.0 3.0 120.0 0.975 0.975 0.975 + Section_Header 3327.0 103.0 109.0 3436.0 0.97 0.9683 0.9691 + Age 556.0 22.0 31.0 587.0 0.9619 0.9472 0.9545 + O2_Saturation 28.0 3.0 6.0 34.0 0.9032 0.8235 0.8615 + Kidney_Disease 97.0 10.0 19.0 116.0 0.9065 0.8362 0.87 + Test 2603.0 391.0 357.0 2960.0 0.8694 0.8794 0.8744 +Communicable_Disease 22.0 6.0 6.0 28.0 0.7857 0.7857 0.7857 + Hypertension 144.0 5.0 5.0 149.0 0.9664 0.9664 0.9664 +External_body_par... 
2401.0 228.0 378.0 2779.0 0.9133 0.864 0.8879 + Oxygen_Therapy 69.0 14.0 10.0 79.0 0.8313 0.8734 0.8519 + Modifier 2229.0 304.0 354.0 2583.0 0.88 0.863 0.8714 + Test_Result 1169.0 165.0 187.0 1356.0 0.8763 0.8621 0.8691 + BMI 5.0 3.0 1.0 6.0 0.625 0.8333 0.7143 + Labour_Delivery 66.0 15.0 17.0 83.0 0.8148 0.7952 0.8049 + Employment 220.0 16.0 37.0 257.0 0.9322 0.856 0.8925 + Fetus_NewBorn 53.0 16.0 23.0 76.0 0.7681 0.6974 0.731 + Clinical_Dept 843.0 69.0 53.0 896.0 0.9243 0.9408 0.9325 + Time 28.0 8.0 11.0 39.0 0.7778 0.7179 0.7467 + Procedure 2893.0 326.0 307.0 3200.0 0.8987 0.9041 0.9014 + Diet 29.0 3.0 18.0 47.0 0.9063 0.617 0.7342 + Oncological 419.0 41.0 36.0 455.0 0.9109 0.9209 0.9158 + LDL 3.0 0.0 1.0 4.0 1.0 0.75 0.8571 + Symptom 6559.0 876.0 908.0 7467.0 0.8822 0.8784 0.8803 + Temperature 86.0 7.0 3.0 89.0 0.9247 0.9663 0.9451 + Vital_Signs_Header 191.0 25.0 19.0 210.0 0.8843 0.9095 0.8967 + Total_Cholesterol 13.0 3.0 7.0 20.0 0.8125 0.65 0.7222 + Relationship_Status 52.0 5.0 2.0 54.0 0.9123 0.963 0.9369 + Blood_Pressure 132.0 15.0 11.0 143.0 0.898 0.9231 0.9103 + Injury_or_Poisoning 500.0 64.0 86.0 586.0 0.8865 0.8532 0.8696 + Drug_Ingredient 1505.0 128.0 91.0 1596.0 0.9216 0.943 0.9322 + Treatment 134.0 21.0 25.0 159.0 0.8645 0.8428 0.8535 + Pregnancy 89.0 23.0 20.0 109.0 0.7946 0.8165 0.8054 + Vaccine 7.0 2.0 2.0 9.0 0.7778 0.7778 0.7778 +Disease_Syndrome_... 2684.0 383.0 344.0 3028.0 0.8751 0.8864 0.8807 + Height 22.0 3.0 1.0 23.0 0.88 0.9565 0.9167 + Frequency 604.0 74.0 67.0 671.0 0.8909 0.9001 0.8955 + Route 783.0 89.0 64.0 847.0 0.8979 0.9244 0.911 + Duration 352.0 83.0 41.0 393.0 0.8092 0.8957 0.8502 + Death_Entity 41.0 3.0 3.0 44.0 0.9318 0.9318 0.9318 +Internal_organ_or... 
5915.0 811.0 713.0 6628.0 0.8794 0.8924 0.8859 + Vaccine_Name 5.0 0.0 3.0 8.0 1.0 0.625 0.7692 + Alcohol 72.0 4.0 6.0 78.0 0.9474 0.9231 0.9351 + Substance_Quantity 3.0 4.0 0.0 3.0 0.4286 1.0 0.6 + Date 544.0 26.0 17.0 561.0 0.9544 0.9697 0.962 + Hyperlipidemia 44.0 4.0 0.0 44.0 0.9167 1.0 0.9565 +Social_History_He... 93.0 3.0 4.0 97.0 0.9688 0.9588 0.9637 + Imaging_Technique 59.0 4.0 31.0 90.0 0.9365 0.6556 0.7712 + Race_Ethnicity 113.0 0.0 0.0 113.0 1.0 1.0 1.0 + Drug_BrandName 819.0 53.0 41.0 860.0 0.9392 0.9523 0.9457 + RelativeDate 530.0 86.0 89.0 619.0 0.8604 0.8562 0.8583 + Gender 5414.0 55.0 47.0 5461.0 0.9899 0.9914 0.9907 + Form 204.0 24.0 35.0 239.0 0.8947 0.8536 0.8737 + Dosage 211.0 21.0 48.0 259.0 0.9095 0.8147 0.8595 +Medical_History_H... 105.0 7.0 2.0 107.0 0.9375 0.9813 0.9589 + Birth_Entity 4.0 0.0 2.0 6.0 1.0 0.6667 0.8 + Substance 72.0 14.0 12.0 84.0 0.8372 0.8571 0.8471 +Sexually_Active_o... 7.0 0.0 0.0 7.0 1.0 1.0 1.0 + Weight 77.0 8.0 11.0 88.0 0.9059 0.875 0.8902 + macro - - - - - - 0.8674 + micro - - - - - - 0.9054 +``` \ No newline at end of file From 808aeca7b8d9e44dc5a78c43c3d3834148e08b65 Mon Sep 17 00:00:00 2001 From: HashamUlHaq Date: Tue, 25 Oct 2022 11:02:26 +0700 Subject: [PATCH 03/57] Add model 2022-10-25-t5_base_pubmedqa_en --- .../2022-10-25-t5_base_pubmedqa_en.md | 89 +++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 docs/_posts/HashamUlHaq/2022-10-25-t5_base_pubmedqa_en.md diff --git a/docs/_posts/HashamUlHaq/2022-10-25-t5_base_pubmedqa_en.md b/docs/_posts/HashamUlHaq/2022-10-25-t5_base_pubmedqa_en.md new file mode 100644 index 00000000000000..d0e4ccbe250399 --- /dev/null +++ b/docs/_posts/HashamUlHaq/2022-10-25-t5_base_pubmedqa_en.md @@ -0,0 +1,89 @@ +--- +layout: model +title: T5 Clinical Summarization / QA model +author: John Snow Labs +name: t5_base_pubmedqa +date: 2022-10-25 +tags: [t5, licensed, clinical, en] +task: Summarization +language: en +edition: Spark NLP for Healthcare 4.1.0 +spark_version: 
[3.0, 3.2] +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +The T5 transformer model described in the seminal paper “Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer” can perform a variety of tasks, such as text summarization, question answering and translation. More details about using the model can be found in the paper (https://arxiv.org/pdf/1910.10683.pdf). This model is specifically trained on medical data for text summarization and question answering. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/t5_base_pubmedqa_en_4.1.0_3.2_1666670271455.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ +.setInputCol("text")\ +.setOutputCol("documents") + +t5 = T5Transformer.pretrained("t5_base_pubmedqa", "en", "clinical/models") \ +.setInputCols(["documents"]) \ +.setOutputCol("t5_output")\ +.setTask("summarize medical questions:")\ +.setMaxOutputLength(200) + +pipeline = Pipeline(stages=[ +document_assembler, +t5 +]) + +data = spark.createDataFrame([ +[1, "content:SUBJECT: Normal physical traits but no period MESSAGE: I'm a 40 yr. old woman that has infantile reproductive organs and have never experienced a mensus. I have had Doctors look but they all say I just have infantile female reproductive organs. When I try to look for answers on the internet I cannot find anything. ALL my \"girly\" parts are normal. My organs never matured. Could you give me more information please. focus:all"] +]).toDF('id', 'text') +results = pipeline.fit(data).transform(data) +results.select("t5_output.result").show(truncate=False) +``` + +
+ +## Results + +```bash +I have a normal physical appearance and have no mensus. Can you give me more information? +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|t5_base_pubmedqa| +|Compatibility:|Spark NLP for Healthcare 4.1.0+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[documents]| +|Output Labels:|[t5]| +|Language:|en| +|Size:|916.7 MB| + +## References + +Trained on Pubmed data & qnli \ No newline at end of file From 5054169c73a00a31c20a5dd738a27ac4d7bce75c Mon Sep 17 00:00:00 2001 From: mauro-nievoff Date: Wed, 26 Oct 2022 00:25:08 +0700 Subject: [PATCH 04/57] Add model 2022-10-25-ner_oncology_en --- .../2022-10-25-ner_oncology_en.md | 221 ++++++++++++++++++ 1 file changed, 221 insertions(+) create mode 100644 docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_en.md diff --git a/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_en.md b/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_en.md new file mode 100644 index 00000000000000..9af839414e4a68 --- /dev/null +++ b/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_en.md @@ -0,0 +1,221 @@ +--- +layout: model +title: Detect Oncology-Specific Entities +author: John Snow Labs +name: ner_oncology +date: 2022-10-25 +tags: [licensed, clinical, oncology, en, ner, biomarker, treatment] +task: Named Entity Recognition +language: en +edition: Spark NLP for Healthcare 4.0.0 +spark_version: 3.0 +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model extracts more than 40 oncology-related entities, including therapies, tests and staging. 
+ +## Predicted Entities + +`Histological_Type`, `Direction`, `Staging`, `Cancer_Score`, `Imaging_Test`, `Cycle_Number`, `Tumor_Finding`, `Site_Lymph_Node`, `Invasion`, `Response_To_Treatment`, `Smoking_Status`, `Tumor_Size`, `Cycle_Count`, `Adenopathy`, `Age`, `Biomarker_Result`, `Unspecific_Therapy`, `Site_Breast`, `Chemotherapy`, `Targeted_Therapy`, `Radiotherapy`, `Performance_Status`, `Pathology_Test`, `Site_Other_Body_Part`, `Cancer_Surgery`, `Line_Of_Therapy`, `Pathology_Result`, `Hormonal_Therapy`, `Site_Bone`, `Biomarker`, `Immunotherapy`, `Cycle_Day`, `Frequency`, `Route`, `Duration`, `Death_Entity`, `Metastasis`, `Site_Liver`, `Cancer_Dx`, `Grade`, `Date`, `Site_Lung`, `Site_Brain`, `Relative_Date`, `Race_Ethnicity`, `Gender`, `Oncogene`, `Dosage`, `Radiation_Dose` + +{:.btn-box} + +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/27.Oncology_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_oncology_en_4.0.0_3.0_1666718178718.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer() \ + .setInputCols(["sentence"]) \ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token"]) \ + .setOutputCol("embeddings") + +ner = MedicalNerModel.pretrained("ner_oncology", "en", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +ner_converter = NerConverter() \ + .setInputCols(["sentence", "token", "ner"]) \ + .setOutputCol("ner_chunk") +pipeline = Pipeline(stages=[document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter]) + +data = spark.createDataFrame([["""The had previously undergone a left mastectomy and an axillary lymph node dissection for a left breast cancer twenty years ago. +The tumor was positive for ER and PR. Postoperatively, radiotherapy was administered to the residual breast. +The cancer recurred as a right lung metastasis 13 years later. 
The patient underwent a regimen consisting of adriamycin (60 mg/m2) and cyclophosphamide (600 mg/m2) over six courses, as first line therapy."""]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val ner = MedicalNerModel.pretrained("ner_oncology", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverter() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + + +val pipeline = new Pipeline().setStages(Array(document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter)) + +val data = Seq("""The had previously undergone a left mastectomy and an axillary lymph node dissection for a left breast cancer twenty years ago. +The tumor was positive for ER and PR. Postoperatively, radiotherapy was administered to the residual breast. +The cancer recurred as a right lung metastasis 13 years later. The patient underwent a regimen consisting of adriamycin (60 mg/m2) and cyclophosphamide (600 mg/m2) over six courses, as first line therapy.""").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) + +``` +
+ +## Results + +```bash +| chunk | ner_label | +|:-------------------------------|:----------------------| +| left | Direction | +| mastectomy | Cancer_Surgery | +| axillary lymph node dissection | Cancer_Surgery | +| left | Direction | +| breast cancer | Cancer_Dx | +| twenty years ago | Relative_Date | +| tumor | Tumor_Finding | +| positive | Biomarker_Result | +| ER | Biomarker | +| PR | Biomarker | +| radiotherapy | Radiotherapy | +| breast | Site_Breast | +| cancer | Cancer_Dx | +| recurred | Response_To_Treatment | +| right | Direction | +| lung | Site_Lung | +| metastasis | Metastasis | +| 13 years later | Relative_Date | +| adriamycin | Chemotherapy | +| 60 mg/m2 | Dosage | +| cyclophosphamide | Chemotherapy | +| 600 mg/m2 | Dosage | +| six courses | Cycle_Count | +| first line | Line_Of_Therapy | +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_oncology| +|Compatibility:|Spark NLP for Healthcare 4.0.0+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|en| +|Size:|34.6 MB| +|Dependencies:|embeddings_clinical| + +## References + +In-house annotated oncology case reports. 
+ +## Benchmarking + +```bash + label tp fp fn total precision recall f1 + Histological_Type 339.0 75.0 114.0 453.0 0.82 0.75 0.78 + Direction 832.0 163.0 152.0 984.0 0.84 0.85 0.84 + Staging 229.0 31.0 29.0 258.0 0.88 0.89 0.88 + Cancer_Score 37.0 8.0 25.0 62.0 0.82 0.60 0.69 + Imaging_Test 2027.0 214.0 177.0 2204.0 0.90 0.92 0.91 + Cycle_Number 73.0 29.0 24.0 97.0 0.72 0.75 0.73 + Tumor_Finding 1114.0 64.0 143.0 1257.0 0.95 0.89 0.91 + Site_Lymph_Node 491.0 53.0 60.0 551.0 0.90 0.89 0.90 + Invasion 158.0 36.0 23.0 181.0 0.81 0.87 0.84 +Response_To_Treatment 431.0 149.0 165.0 596.0 0.74 0.72 0.73 + Smoking_Status 66.0 18.0 2.0 68.0 0.79 0.97 0.87 + Tumor_Size 1050.0 112.0 79.0 1129.0 0.90 0.93 0.92 + Cycle_Count 177.0 62.0 53.0 230.0 0.74 0.77 0.75 + Adenopathy 67.0 12.0 29.0 96.0 0.85 0.70 0.77 + Age 930.0 33.0 19.0 949.0 0.97 0.98 0.97 + Biomarker_Result 1160.0 169.0 285.0 1445.0 0.87 0.80 0.84 + Unspecific_Therapy 198.0 86.0 80.0 278.0 0.70 0.71 0.70 + Site_Breast 125.0 15.0 22.0 147.0 0.89 0.85 0.87 + Chemotherapy 814.0 55.0 65.0 879.0 0.94 0.93 0.93 + Targeted_Therapy 195.0 27.0 33.0 228.0 0.88 0.86 0.87 + Radiotherapy 276.0 29.0 34.0 310.0 0.90 0.89 0.90 + Performance_Status 121.0 17.0 14.0 135.0 0.88 0.90 0.89 + Pathology_Test 888.0 296.0 162.0 1050.0 0.75 0.85 0.79 + Site_Other_Body_Part 909.0 275.0 592.0 1501.0 0.77 0.61 0.68 + Cancer_Surgery 693.0 119.0 126.0 819.0 0.85 0.85 0.85 + Line_Of_Therapy 101.0 11.0 5.0 106.0 0.90 0.95 0.93 + Pathology_Result 655.0 279.0 487.0 1142.0 0.70 0.57 0.63 + Hormonal_Therapy 169.0 4.0 16.0 185.0 0.98 0.91 0.94 + Site_Bone 264.0 81.0 49.0 313.0 0.77 0.84 0.80 + Biomarker 1259.0 238.0 256.0 1515.0 0.84 0.83 0.84 + Immunotherapy 103.0 47.0 25.0 128.0 0.69 0.80 0.74 + Cycle_Day 200.0 36.0 48.0 248.0 0.85 0.81 0.83 + Frequency 354.0 27.0 73.0 427.0 0.93 0.83 0.88 + Route 91.0 15.0 22.0 113.0 0.86 0.81 0.83 + Duration 625.0 161.0 136.0 761.0 0.80 0.82 0.81 + Death_Entity 34.0 2.0 4.0 38.0 0.94 0.89 0.92 + Metastasis 353.0 
18.0 17.0 370.0 0.95 0.95 0.95 + Site_Liver 189.0 64.0 45.0 234.0 0.75 0.81 0.78 + Cancer_Dx 1301.0 103.0 93.0 1394.0 0.93 0.93 0.93 + Grade 190.0 27.0 46.0 236.0 0.88 0.81 0.84 + Date 807.0 21.0 24.0 831.0 0.97 0.97 0.97 + Site_Lung 469.0 110.0 90.0 559.0 0.81 0.84 0.82 + Site_Brain 221.0 64.0 58.0 279.0 0.78 0.79 0.78 + Relative_Date 1211.0 401.0 111.0 1322.0 0.75 0.92 0.83 + Race_Ethnicity 57.0 8.0 5.0 62.0 0.88 0.92 0.90 + Gender 1247.0 17.0 7.0 1254.0 0.99 0.99 0.99 + Oncogene 345.0 83.0 104.0 449.0 0.81 0.77 0.79 + Dosage 900.0 30.0 160.0 1060.0 0.97 0.85 0.90 + Radiation_Dose 108.0 5.0 18.0 126.0 0.96 0.86 0.90 + macro_avg 24653.0 3999.0 4406.0 29059.0 0.85 0.84 0.84 + micro_avg NaN NaN NaN NaN 0.86 0.85 0.85 +``` \ No newline at end of file From 29512d1b66bcc3ac9ed9a3df3c8d7ad410a83cdb Mon Sep 17 00:00:00 2001 From: mauro-nievoff Date: Wed, 26 Oct 2022 00:39:17 +0700 Subject: [PATCH 05/57] Add model 2022-10-25-ner_oncology_therapy_en --- .../2022-10-25-ner_oncology_therapy_en.md | 175 ++++++++++++++++++ 1 file changed, 175 insertions(+) create mode 100644 docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_therapy_en.md diff --git a/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_therapy_en.md b/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_therapy_en.md new file mode 100644 index 00000000000000..15b311fa773fb3 --- /dev/null +++ b/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_therapy_en.md @@ -0,0 +1,175 @@ +--- +layout: model +title: Detect Entities Related to Cancer Therapies +author: John Snow Labs +name: ner_oncology_therapy +date: 2022-10-25 +tags: [licensed, clinical, oncology, en, ner, treatment] +task: Named Entity Recognition +language: en +edition: Spark NLP for Healthcare 4.0.0 +spark_version: 3.0 +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model extracts entities related to oncology therapies using granular labels, including mentions of treatments, posology 
information and line of therapy. + +## Predicted Entities + +`Cycle_Number`, `Response_To_Treatment`, `Cycle_Count`, `Unspecific_Therapy`, `Chemotherapy`, `Targeted_Therapy`, `Radiotherapy`, `Cancer_Surgery`, `Line_Of_Therapy`, `Hormonal_Therapy`, `Immunotherapy`, `Cycle_Day`, `Frequency`, `Route`, `Duration`, `Dosage`, `Radiation_Dose` + +{:.btn-box} + +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/27.Oncology_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_oncology_therapy_en_4.0.0_3.0_1666718855759.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer() \ + .setInputCols(["sentence"]) \ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token"]) \ + .setOutputCol("embeddings") + +ner = MedicalNerModel.pretrained("ner_oncology_therapy", "en", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +ner_converter = NerConverter() \ + .setInputCols(["sentence", "token", "ner"]) \ + .setOutputCol("ner_chunk") +pipeline = Pipeline(stages=[document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter]) + +data = spark.createDataFrame([["The patient had previously undergone a left mastectomy and an axillary lymph node dissection for a left breast cancer twenty years ago. +The tumor was positive for ER and PR. Postoperatively, radiotherapy was administered to her breast. +The cancer recurred as a right lung metastasis 13 years later. 
The patient underwent a regimen consisting of adriamycin (60 mg/m2) and cyclophosphamide (600 mg/m2) over six courses, as first line therapy."]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val ner = MedicalNerModel.pretrained("ner_oncology_therapy", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverter() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + + +val pipeline = new Pipeline().setStages(Array(document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter)) + +val data = Seq("The patient had previously undergone a left mastectomy and an axillary lymph node dissection for a left breast cancer twenty years ago. +The tumor was positive for ER and PR. Postoperatively, radiotherapy was administered to her breast. +The cancer recurred as a right lung metastasis 13 years later. The patient underwent a regimen consisting of adriamycin (60 mg/m2) and cyclophosphamide (600 mg/m2) over six courses, as first line therapy.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) + +``` +
+ +## Results + +```bash +| chunk | ner_label | +|:-------------------------------|:----------------------| +| mastectomy | Cancer_Surgery | +| axillary lymph node dissection | Cancer_Surgery | +| radiotherapy | Radiotherapy | +| recurred | Response_To_Treatment | +| adriamycin | Chemotherapy | +| 60 mg/m2 | Dosage | +| cyclophosphamide | Chemotherapy | +| 600 mg/m2 | Dosage | +| six courses | Cycle_Count | +| first line | Line_Of_Therapy | +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_oncology_therapy| +|Compatibility:|Spark NLP for Healthcare 4.0.0+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|en| +|Size:|34.4 MB| +|Dependencies:|embeddings_clinical| + +## References + +In-house annotated oncology case reports. + +## Benchmarking + +```bash + label tp fp fn total precision recall f1 + Cycle_Number 78.0 41.0 19.0 97.0 0.66 0.80 0.72 +Response_To_Treatment 451.0 205.0 145.0 596.0 0.69 0.76 0.72 + Cycle_Count 210.0 75.0 20.0 230.0 0.74 0.91 0.82 + Unspecific_Therapy 189.0 76.0 89.0 278.0 0.71 0.68 0.70 + Chemotherapy 831.0 87.0 48.0 879.0 0.91 0.95 0.92 + Targeted_Therapy 194.0 28.0 34.0 228.0 0.87 0.85 0.86 + Radiotherapy 279.0 35.0 31.0 310.0 0.89 0.90 0.89 + Cancer_Surgery 720.0 192.0 99.0 819.0 0.79 0.88 0.83 + Line_Of_Therapy 95.0 6.0 11.0 106.0 0.94 0.90 0.92 + Hormonal_Therapy 170.0 6.0 15.0 185.0 0.97 0.92 0.94 + Immunotherapy 96.0 17.0 32.0 128.0 0.85 0.75 0.80 + Cycle_Day 205.0 38.0 43.0 248.0 0.84 0.83 0.84 + Frequency 363.0 33.0 64.0 427.0 0.92 0.85 0.88 + Route 93.0 6.0 20.0 113.0 0.94 0.82 0.88 + Duration 527.0 102.0 234.0 761.0 0.84 0.69 0.76 + Dosage 959.0 63.0 101.0 1060.0 0.94 0.90 0.92 + Radiation_Dose 106.0 12.0 20.0 126.0 0.90 0.84 0.87 + macro_avg 5566.0 1022.0 1025.0 6591.0 0.85 0.84 0.84 + micro_avg NaN NaN NaN NaN 0.85 0.84 0.84 +``` \ No newline at end of file From fa4e6ff51e9e4c34c77989dc0dca5a4a5df1ad89 Mon Sep 
17 00:00:00 2001 From: mauro-nievoff Date: Wed, 26 Oct 2022 00:47:13 +0700 Subject: [PATCH 06/57] Add model 2022-10-25-ner_oncology_diagnosis_en --- .../2022-10-25-ner_oncology_diagnosis_en.md | 163 ++++++++++++++++++ 1 file changed, 163 insertions(+) create mode 100644 docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_diagnosis_en.md diff --git a/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_diagnosis_en.md b/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_diagnosis_en.md new file mode 100644 index 00000000000000..82dcc17203a594 --- /dev/null +++ b/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_diagnosis_en.md @@ -0,0 +1,163 @@ +--- +layout: model +title: Detect Entities Related to Cancer Diagnosis +author: John Snow Labs +name: ner_oncology_diagnosis +date: 2022-10-25 +tags: [licensed, clinical, oncology, en, ner] +task: Named Entity Recognition +language: en +edition: Spark NLP for Healthcare 4.0.0 +spark_version: 3.0 +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model extracts entities related to cancer diagnosis, such as Metastasis, Histological_Type or Invasion. + +## Predicted Entities + +`Histological_Type`, `Staging`, `Cancer_Score`, `Tumor_Finding`, `Invasion`, `Tumor_Size`, `Adenopathy`, `Performance_Status`, `Pathology_Result`, `Metastasis`, `Cancer_Dx`, `Grade` + +{:.btn-box} + +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/27.Oncology_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_oncology_diagnosis_en_4.0.0_3.0_1666719602276.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer() \ + .setInputCols(["sentence"]) \ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token"]) \ + .setOutputCol("embeddings") + +ner = MedicalNerModel.pretrained("ner_oncology_diagnosis", "en", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +ner_converter = NerConverter() \ + .setInputCols(["sentence", "token", "ner"]) \ + .setOutputCol("ner_chunk") +pipeline = Pipeline(stages=[document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter]) + +data = spark.createDataFrame([["Two years ago, the patient presented with a tumor in her left breast and adenopathies. She was diagnosed with invasive ductal carcinoma. 
+Last week she was also found to have a lung metastasis."]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val ner = MedicalNerModel.pretrained("ner_oncology_diagnosis", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverter() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + + +val pipeline = new Pipeline().setStages(Array(document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter)) + +val data = Seq("Two years ago, the patient presented with a tumor in her left breast and adenopathies. She was diagnosed with invasive ductal carcinoma. +Last week she was also found to have a lung metastasis.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash +| chunk | ner_label | +|:-------------|:------------------| +| tumor | Tumor_Finding | +| adenopathies | Adenopathy | +| invasive | Histological_Type | +| ductal | Histological_Type | +| carcinoma | Cancer_Dx | +| metastasis | Metastasis | +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_oncology_diagnosis| +|Compatibility:|Spark NLP for Healthcare 4.0.0+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|en| +|Size:|34.3 MB| +|Dependencies:|embeddings_clinical| + +## References + +In-house annotated oncology case reports. + +## Benchmarking + +```bash + label tp fp fn total precision recall f1 + Histological_Type 354.0 63.0 99.0 453.0 0.85 0.78 0.81 + Staging 234.0 27.0 24.0 258.0 0.90 0.91 0.90 + Cancer_Score 36.0 15.0 26.0 62.0 0.71 0.58 0.64 + Tumor_Finding 1121.0 83.0 136.0 1257.0 0.93 0.89 0.91 + Invasion 154.0 27.0 27.0 181.0 0.85 0.85 0.85 + Tumor_Size 1058.0 126.0 71.0 1129.0 0.89 0.94 0.91 + Adenopathy 66.0 10.0 30.0 96.0 0.87 0.69 0.77 +Performance_Status 116.0 15.0 19.0 135.0 0.89 0.86 0.87 + Pathology_Result 852.0 686.0 290.0 1142.0 0.55 0.75 0.64 + Metastasis 356.0 15.0 14.0 370.0 0.96 0.96 0.96 + Cancer_Dx 1302.0 88.0 92.0 1394.0 0.94 0.93 0.94 + Grade 201.0 23.0 35.0 236.0 0.90 0.85 0.87 + macro_avg 5850.0 1178.0 863.0 6713.0 0.85 0.83 0.84 + micro_avg NaN NaN NaN NaN 0.85 0.87 0.86 +``` \ No newline at end of file From 4e38f261f36dd9547e79abc70ecb02110640a079 Mon Sep 17 00:00:00 2001 From: mauro-nievoff Date: Wed, 26 Oct 2022 00:53:31 +0700 Subject: [PATCH 07/57] Add model 2022-10-25-ner_oncology_tnm_en --- .../2022-10-25-ner_oncology_tnm_en.md | 156 ++++++++++++++++++ 1 file changed, 156 insertions(+) create mode 100644 docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_tnm_en.md diff --git a/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_tnm_en.md 
b/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_tnm_en.md new file mode 100644 index 00000000000000..e9779d71d1b040 --- /dev/null +++ b/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_tnm_en.md @@ -0,0 +1,156 @@ +--- +layout: model +title: Extract Entities Related to TNM Staging +author: John Snow Labs +name: ner_oncology_tnm +date: 2022-10-25 +tags: [licensed, clinical, oncology, en, ner] +task: Named Entity Recognition +language: en +edition: Spark NLP for Healthcare 4.0.0 +spark_version: 3.0 +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model extracts staging information and mentions related to tumors, lymph nodes and metastases. + +## Predicted Entities + +`Lymph_Node`, `Staging`, `Lymph_Node_Modifier`, `Tumor_Description`, `Tumor`, `Metastasis`, `Cancer_Dx` + +{:.btn-box} + +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/27.Oncology_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_oncology_tnm_en_4.0.0_3.0_1666720053687.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer() \ + .setInputCols(["sentence"]) \ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token"]) \ + .setOutputCol("embeddings") + +ner = MedicalNerModel.pretrained("ner_oncology_tnm", "en", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +ner_converter = NerConverter() \ + .setInputCols(["sentence", "token", "ner"]) \ + .setOutputCol("ner_chunk") +pipeline = Pipeline(stages=[document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter]) + +data = spark.createDataFrame([["The final diagnosis was metastatic breast carcinoma, and it was classified as T2N1M1 stage IV. 
The histological grade of this 4 cm tumor was grade 2."]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val ner = MedicalNerModel.pretrained("ner_oncology_tnm", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverter() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + + +val pipeline = new Pipeline().setStages(Array(document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter)) + +val data = Seq("The final diagnosis was metastatic breast carcinoma, and it was classified as T2N1M1 stage IV. The histological grade of this 4 cm tumor was grade 2.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash +| chunk | ner_label | +|:-----------------|:------------------| +| metastatic | Metastasis | +| breast carcinoma | Cancer_Dx | +| T2N1M1 stage IV | Staging | +| 4 cm | Tumor_Description | +| tumor | Tumor | +| grade 2 | Tumor_Description | +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_oncology_tnm| +|Compatibility:|Spark NLP for Healthcare 4.0.0+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|en| +|Size:|34.2 MB| +|Dependencies:|embeddings_clinical| + +## References + +In-house annotated oncology case reports. + +## Benchmarking + +```bash + label tp fp fn total precision recall f1 + Lymph_Node 570.0 77.0 77.0 647.0 0.88 0.88 0.88 + Staging 232.0 22.0 26.0 258.0 0.91 0.90 0.91 +Lymph_Node_Modifier 30.0 5.0 5.0 35.0 0.86 0.86 0.86 + Tumor_Description 2651.0 581.0 490.0 3141.0 0.82 0.84 0.83 + Tumor 1116.0 72.0 141.0 1257.0 0.94 0.89 0.91 + Metastasis 358.0 15.0 12.0 370.0 0.96 0.97 0.96 + Cancer_Dx 1302.0 87.0 92.0 1394.0 0.94 0.93 0.94 + macro_avg 6259.0 859.0 843.0 7102.0 0.90 0.90 0.90 + micro_avg NaN NaN NaN NaN 0.88 0.88 0.88 +``` \ No newline at end of file From c4d5544e0f93ad168d736ebe7fa3dfebd222aa57 Mon Sep 17 00:00:00 2001 From: mauro-nievoff Date: Wed, 26 Oct 2022 01:00:17 +0700 Subject: [PATCH 08/57] Add model 2022-10-25-ner_oncology_anatomy_general_en --- ...2-10-25-ner_oncology_anatomy_general_en.md | 151 ++++++++++++++++++ 1 file changed, 151 insertions(+) create mode 100644 docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_anatomy_general_en.md diff --git a/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_anatomy_general_en.md b/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_anatomy_general_en.md new file mode 100644 index 00000000000000..12a24624d94560 --- /dev/null +++ b/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_anatomy_general_en.md @@ -0,0 +1,151 @@ +--- +layout: model +title: Extract 
Anatomical Entities from Oncology Texts +author: John Snow Labs +name: ner_oncology_anatomy_general +date: 2022-10-25 +tags: [licensed, clinical, oncology, en, ner, anatomy] +task: Named Entity Recognition +language: en +edition: Spark NLP for Healthcare 4.0.0 +spark_version: 3.0 +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model extracts anatomical entities using an unspecific label. + +## Predicted Entities + +`Anatomical_Site`, `Direction` + +{:.btn-box} + +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/27.Oncology_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_oncology_anatomy_general_en_4.0.0_3.0_1666720431299.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer() \ + .setInputCols(["sentence"]) \ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token"]) \ + .setOutputCol("embeddings") + +ner = MedicalNerModel.pretrained("ner_oncology_anatomy_general", "en", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +ner_converter = NerConverter() \ + .setInputCols(["sentence", "token", "ner"]) \ + .setOutputCol("ner_chunk") +pipeline = Pipeline(stages=[document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter]) + +data = spark.createDataFrame([["The patient presented a mass in her left breast, and a possible metastasis in her lungs and in her liver."]]).toDF("text") + +result = pipeline.fit(data).transform(data) + +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val ner = MedicalNerModel.pretrained("ner_oncology_anatomy_general", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new 
NerConverter() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + + +val pipeline = new Pipeline().setStages(Array(document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter)) + +val data = Seq("The patient presented a mass in her left breast, and a possible metastasis in her lungs and in her liver.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) + +``` +
+ +## Results + +```bash +| chunk | ner_label | +|:--------|:----------------| +| left | Direction | +| breast | Anatomical_Site | +| lungs | Anatomical_Site | +| liver | Anatomical_Site | +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_oncology_anatomy_general| +|Compatibility:|Spark NLP for Healthcare 4.0.0+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|en| +|Size:|34.3 MB| +|Dependencies:|embeddings_clinical| + +## References + +In-house annotated oncology case reports. + +## Benchmarking + +```bash + label tp fp fn total precision recall f1 +Anatomical_Site 2946.0 549.0 638.0 3584.0 0.84 0.82 0.83 + Direction 864.0 209.0 120.0 984.0 0.81 0.88 0.84 + macro_avg 3810.0 758.0 758.0 4568.0 0.82 0.85 0.84 + micro_avg NaN NaN NaN NaN 0.83 0.83 0.83 +``` \ No newline at end of file From fad154d7c8b6cabb73023ac5b947c909da80b1ec Mon Sep 17 00:00:00 2001 From: mauro-nievoff Date: Wed, 26 Oct 2022 01:15:44 +0700 Subject: [PATCH 09/57] Add model 2022-10-25-ner_oncology_demographics_en --- ...2022-10-25-ner_oncology_demographics_en.md | 150 ++++++++++++++++++ 1 file changed, 150 insertions(+) create mode 100644 docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_demographics_en.md diff --git a/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_demographics_en.md b/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_demographics_en.md new file mode 100644 index 00000000000000..cc60b3ecf53588 --- /dev/null +++ b/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_demographics_en.md @@ -0,0 +1,150 @@ +--- +layout: model +title: Extract Demographic Entities from Oncology Texts +author: John Snow Labs +name: ner_oncology_demographics +date: 2022-10-25 +tags: [licensed, clinical, oncology, en, ner, demographics] +task: Named Entity Recognition +language: en +edition: Spark NLP for Healthcare 4.0.0 +spark_version: 3.0 +supported: true +article_header: + type: cover 
+use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model extracts demographic information from oncology texts, including age, gender and smoking status. + +## Predicted Entities + +`Smoking_Status`, `Age`, `Race_Ethnicity`, `Gender` + +{:.btn-box} + +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/27.Oncology_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_oncology_demographics_en_4.0.0_3.0_1666720851983.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer() \ + .setInputCols(["sentence"]) \ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token"]) \ + .setOutputCol("embeddings") + +ner = MedicalNerModel.pretrained("ner_oncology_demographics", "en", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +ner_converter = NerConverter() \ + .setInputCols(["sentence", "token", "ner"]) \ + .setOutputCol("ner_chunk") +pipeline = Pipeline(stages=[document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter]) + +data = spark.createDataFrame([["The patient is a 40-year-old man with history of heavy smoking."]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val ner = MedicalNerModel.pretrained("ner_oncology_demographics", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverter() + .setInputCols(Array("sentence", "token", 
"ner")) + .setOutputCol("ner_chunk") + + +val pipeline = new Pipeline().setStages(Array(document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter)) + +val data = Seq("The patient is a 40-year-old man with history of heavy smoking.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash +| chunk | ner_label | +|:------------|:---------------| +| 40-year-old | Age | +| man | Gender | +| smoking | Smoking_Status | +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_oncology_demographics| +|Compatibility:|Spark NLP for Healthcare 4.0.0+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|en| +|Size:|34.6 MB| +|Dependencies:|embeddings_clinical| + +## References + +In-house annotated oncology case reports. + +## Benchmarking + +```bash + label tp fp fn total precision recall f1 +Smoking_Status 60.0 19.0 8.0 68.0 0.76 0.88 0.82 + Age 934.0 33.0 15.0 949.0 0.97 0.98 0.97 +Race_Ethnicity 57.0 5.0 5.0 62.0 0.92 0.92 0.92 + Gender 1248.0 18.0 6.0 1254.0 0.99 1.00 0.99 + macro_avg 2299.0 75.0 34.0 2333.0 0.91 0.95 0.93 + micro_avg NaN NaN NaN NaN 0.97 0.99 0.98 +``` \ No newline at end of file From f19d3470695da4a03a2aa3a9be593ce8ef4625cd Mon Sep 17 00:00:00 2001 From: mauro-nievoff Date: Wed, 26 Oct 2022 01:23:11 +0700 Subject: [PATCH 10/57] Add model 2022-10-25-ner_oncology_test_en --- .../2022-10-25-ner_oncology_test_en.md | 152 ++++++++++++++++++ 1 file changed, 152 insertions(+) create mode 100644 docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_test_en.md diff --git a/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_test_en.md b/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_test_en.md new file mode 100644 index 00000000000000..a01865da5ea3c8 --- /dev/null +++ b/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_test_en.md @@ -0,0 +1,152 @@ +--- +layout: model +title: Extract Oncology Tests +author: John Snow Labs +name: ner_oncology_test +date: 2022-10-25 +tags: [licensed, clinical, oncology, en, ner, test] +task: Named Entity Recognition +language: en +edition: Spark NLP for Healthcare 4.0.0 +spark_version: 3.0 +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- 
+ +## Description + +This model extracts mentions of tests from oncology texts, including pathology tests and imaging tests. + +## Predicted Entities + +`Imaging_Test`, `Biomarker_Result`, `Pathology_Test`, `Biomarker`, `Oncogene` + +{:.btn-box} + +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/27.Oncology_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_oncology_test_en_4.0.0_3.0_1666721761945.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer() \ + .setInputCols(["sentence"]) \ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token"]) \ + .setOutputCol("embeddings") + +ner = MedicalNerModel.pretrained("ner_oncology_test", "en", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +ner_converter = NerConverter() \ + .setInputCols(["sentence", "token", "ner"]) \ + .setOutputCol("ner_chunk") +pipeline = Pipeline(stages=[document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter]) + +data = spark.createDataFrame([["A biopsy was conducted using an ultrasound guided thick-needle. 
His chest computed tomography (CT) scan was negative."]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val ner = MedicalNerModel.pretrained("ner_oncology_test", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverter() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + + +val pipeline = new Pipeline().setStages(Array(document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter)) + +val data = Seq("A biopsy was conducted using an ultrasound guided thick-needle. His chest computed tomography (CT) scan was negative.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash +| chunk | ner_label | +|:-------------------------------|:---------------| +| biopsy | Pathology_Test | +| ultrasound guided thick-needle | Pathology_Test | +| chest computed tomography | Imaging_Test | +| CT | Imaging_Test | +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_oncology_test| +|Compatibility:|Spark NLP for Healthcare 4.0.0+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|en| +|Size:|34.2 MB| +|Dependencies:|embeddings_clinical| + +## References + +In-house annotated oncology case reports. + +## Benchmarking + +```bash + label tp fp fn total precision recall f1 + Imaging_Test 2020.0 229.0 184.0 2204.0 0.90 0.92 0.91 +Biomarker_Result 1177.0 186.0 268.0 1445.0 0.86 0.81 0.84 + Pathology_Test 888.0 276.0 162.0 1050.0 0.76 0.85 0.80 + Biomarker 1287.0 254.0 228.0 1515.0 0.84 0.85 0.84 + Oncogene 365.0 89.0 84.0 449.0 0.80 0.81 0.81 + macro_avg 5737.0 1034.0 926.0 6663.0 0.83 0.85 0.84 + micro_avg NaN NaN NaN NaN 0.85 0.86 0.85 +``` \ No newline at end of file From 9e7f69caf26be1c73374f1922f3dc697512af592 Mon Sep 17 00:00:00 2001 From: mauro-nievoff Date: Wed, 26 Oct 2022 01:29:32 +0700 Subject: [PATCH 11/57] Add model 2022-10-25-ner_oncology_unspecific_posology_en --- ...-25-ner_oncology_unspecific_posology_en.md | 154 ++++++++++++++++++ 1 file changed, 154 insertions(+) create mode 100644 docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_unspecific_posology_en.md diff --git a/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_unspecific_posology_en.md b/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_unspecific_posology_en.md new file mode 100644 index 00000000000000..833d8a6a31304b --- /dev/null +++ b/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_unspecific_posology_en.md @@ -0,0 +1,154 @@ +--- +layout: model +title: Extract Cancer Therapies and Posology Information +author: John Snow Labs +name: 
ner_oncology_unspecific_posology +date: 2022-10-25 +tags: [licensed, clinical, oncology, en, ner, treatment, posology] +task: Named Entity Recognition +language: en +edition: Spark NLP for Healthcare 4.0.0 +spark_version: 3.0 +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model extracts mentions of treatments and posology information using unspecific labels (low granularity). + +## Predicted Entities + +`Posology_Information`, `Cancer_Therapy` + +{:.btn-box} + +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/27.Oncology_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_oncology_unspecific_posology_en_4.0.0_3.0_1666722206468.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer() \ + .setInputCols(["sentence"]) \ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token"]) \ + .setOutputCol("embeddings") + +ner = MedicalNerModel.pretrained("ner_oncology_unspecific_posology", "en", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +ner_converter = NerConverter() \ + .setInputCols(["sentence", "token", "ner"]) \ + .setOutputCol("ner_chunk") +pipeline = Pipeline(stages=[document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter]) + +data = spark.createDataFrame([["The patient underwent a regimen consisting of adriamycin (60 mg/m2) and cyclophosphamide (600 mg/m2) over six courses. 
She is currently receiving his second cycle of chemotherapy and is in good overall condition."]]).toDF("text") + +result = pipeline.fit(data).transform(data) + +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val ner = MedicalNerModel.pretrained("ner_oncology_unspecific_posology", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverter() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + + +val pipeline = new Pipeline().setStages(Array(document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter)) + +val data = Seq("The patient underwent a regimen consisting of adriamycin (60 mg/m2) and cyclophosphamide (600 mg/m2) over six courses. She is currently receiving his second cycle of chemotherapy and is in good overall condition.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash +| chunk | ner_label | +|:-----------------|:---------------------| +| adriamycin | Cancer_Therapy | +| 60 mg/m2 | Posology_Information | +| cyclophosphamide | Cancer_Therapy | +| 600 mg/m2 | Posology_Information | +| over six courses | Posology_Information | +| second cycle | Posology_Information | +| chemotherapy | Cancer_Therapy | + +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_oncology_unspecific_posology| +|Compatibility:|Spark NLP for Healthcare 4.0.0+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|en| +|Size:|34.3 MB| +|Dependencies:|embeddings_clinical| + +## References + +In-house annotated oncology case reports. + +## Benchmarking + +```bash + label tp fp fn total precision recall f1 +Posology_Information 2663.0 244.0 399.0 3062.0 0.92 0.87 0.89 + Cancer_Therapy 2580.0 317.0 247.0 2827.0 0.89 0.91 0.90 + macro_avg 5243.0 561.0 646.0 5889.0 0.90 0.89 0.90 + micro_avg NaN NaN NaN NaN 0.90 0.89 0.90 +``` \ No newline at end of file From b18986491e5553b3fcbea2ce3212d5177f645f8b Mon Sep 17 00:00:00 2001 From: mauro-nievoff Date: Wed, 26 Oct 2022 01:35:45 +0700 Subject: [PATCH 12/57] Add model 2022-10-25-ner_oncology_anatomy_granular_en --- ...-10-25-ner_oncology_anatomy_granular_en.md | 155 ++++++++++++++++++ 1 file changed, 155 insertions(+) create mode 100644 docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_anatomy_granular_en.md diff --git a/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_anatomy_granular_en.md b/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_anatomy_granular_en.md new file mode 100644 index 00000000000000..09ca038910aa19 --- /dev/null +++ b/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_anatomy_granular_en.md @@ -0,0 +1,155 @@ +--- +layout: model +title: Extract Granular Anatomical Entities from Oncology Texts +author: John Snow Labs +name: ner_oncology_anatomy_granular +date: 
2022-10-25 +tags: [licensed, clinical, oncology, en, ner, anatomy] +task: Named Entity Recognition +language: en +edition: Spark NLP for Healthcare 4.0.0 +spark_version: 3.0 +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model extracts mentions of anatomical entities using granular labels. + +## Predicted Entities + +`Direction`, `Site_Lymph_Node`, `Site_Breast`, `Site_Other_Body_Part`, `Site_Bone`, `Site_Liver`, `Site_Lung`, `Site_Brain` + +{:.btn-box} + +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/27.Oncology_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_oncology_anatomy_granular_en_4.0.0_3.0_1666722590194.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer() \ + .setInputCols(["sentence"]) \ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token"]) \ + .setOutputCol("embeddings") + +ner = MedicalNerModel.pretrained("ner_oncology_anatomy_granular", "en", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +ner_converter = NerConverter() \ + .setInputCols(["sentence", "token", "ner"]) \ + .setOutputCol("ner_chunk") +pipeline = Pipeline(stages=[document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter]) + +data = spark.createDataFrame([["The patient presented a mass in her left breast, and a possible metastasis in her lungs and in her liver."]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val ner = MedicalNerModel.pretrained("ner_oncology_anatomy_granular", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new 
NerConverter() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + + +val pipeline = new Pipeline().setStages(Array(document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter)) + +val data = Seq("The patient presented a mass in her left breast, and a possible metastasis in her lungs and in her liver.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash +| chunk | ner_label | +|:--------|:------------| +| left | Direction | +| breast | Site_Breast | +| lungs | Site_Lung | +| liver | Site_Liver | +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_oncology_anatomy_granular| +|Compatibility:|Spark NLP for Healthcare 4.0.0+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|en| +|Size:|34.3 MB| +|Dependencies:|embeddings_clinical| + +## References + +In-house annotated oncology case reports. + +## Benchmarking + +```bash + label tp fp fn total precision recall f1 + Direction 822.0 221.0 162.0 984.0 0.79 0.84 0.81 + Site_Lymph_Node 481.0 38.0 70.0 551.0 0.93 0.87 0.90 + Site_Breast 88.0 14.0 59.0 147.0 0.86 0.60 0.71 +Site_Other_Body_Part 604.0 184.0 897.0 1501.0 0.77 0.40 0.53 + Site_Bone 252.0 74.0 61.0 313.0 0.77 0.81 0.79 + Site_Liver 178.0 92.0 56.0 234.0 0.66 0.76 0.71 + Site_Lung 398.0 98.0 161.0 559.0 0.80 0.71 0.75 + Site_Brain 197.0 44.0 82.0 279.0 0.82 0.71 0.76 + macro_avg 3020.0 765.0 1548.0 4568.0 0.80 0.71 0.74 + micro_avg NaN NaN NaN NaN 0.80 0.66 0.71 +``` \ No newline at end of file From bceabd64035647b125111b491f867aedd21342da Mon Sep 17 00:00:00 2001 From: mauro-nievoff Date: Wed, 26 Oct 2022 01:42:05 +0700 Subject: [PATCH 13/57] Add model 2022-10-25-ner_oncology_response_to_treatment_en --- ...5-ner_oncology_response_to_treatment_en.md | 148 ++++++++++++++++++ 1 file changed, 148 insertions(+) create mode 100644 docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_response_to_treatment_en.md diff --git a/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_response_to_treatment_en.md b/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_response_to_treatment_en.md new file mode 100644 index 00000000000000..8912d175e260ac --- /dev/null +++ b/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_response_to_treatment_en.md @@ -0,0 +1,148 @@ +--- +layout: model +title: 
Extract Mentions of Response to Cancer Treatment +author: John Snow Labs +name: ner_oncology_response_to_treatment +date: 2022-10-25 +tags: [licensed, clinical, oncology, en, ner, treatment] +task: Named Entity Recognition +language: en +edition: Spark NLP for Healthcare 4.0.0 +spark_version: 3.0 +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model extracts entities related to the patient's response to the oncology treatment, including clinical response and changes in tumor size. + +## Predicted Entities + +`Response_To_Treatment`, `Size_Trend`, `Line_Of_Therapy` + +{:.btn-box} + +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/27.Oncology_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_oncology_response_to_treatment_en_4.0.0_3.0_1666722959227.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer() \ + .setInputCols(["sentence"]) \ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token"]) \ + .setOutputCol("embeddings") + +ner = MedicalNerModel.pretrained("ner_oncology_response_to_treatment", "en", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +ner_converter = NerConverter() \ + .setInputCols(["sentence", "token", "ner"]) \ + .setOutputCol("ner_chunk") +pipeline = Pipeline(stages=[document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter]) + +data = spark.createDataFrame([["She completed her first-line therapy, but some months later there was recurrence of the breast cancer. 
"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val ner = MedicalNerModel.pretrained("ner_oncology_response_to_treatment", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverter() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + + +val pipeline = new Pipeline().setStages(Array(document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter)) + +val data = Seq("She completed her first-line therapy, but some months later there was recurrence of the breast cancer. ").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash +| chunk | ner_label | +|:-----------|:----------------------| +| first-line | Line_Of_Therapy | +| recurrence | Response_To_Treatment | +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_oncology_response_to_treatment| +|Compatibility:|Spark NLP for Healthcare 4.0.0+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|en| +|Size:|34.4 MB| +|Dependencies:|embeddings_clinical| + +## References + +In-house annotated oncology case reports. + +## Benchmarking + +```bash + label tp fp fn total precision recall f1 +Response_To_Treatment 326.0 101.0 157.0 483.0 0.76 0.67 0.72 + Size_Trend 43.0 28.0 70.0 113.0 0.61 0.38 0.47 + Line_Of_Therapy 99.0 11.0 7.0 106.0 0.90 0.93 0.92 + macro_avg 468.0 140.0 234.0 702.0 0.76 0.66 0.70 + micro_avg NaN NaN NaN NaN 0.76 0.67 0.71 +``` \ No newline at end of file From 7de9179b8d33f84c00150022ff472f3fe005597b Mon Sep 17 00:00:00 2001 From: mauro-nievoff Date: Wed, 26 Oct 2022 01:49:28 +0700 Subject: [PATCH 14/57] Add model 2022-10-25-ner_oncology_biomarker_en --- .../2022-10-25-ner_oncology_biomarker_en.md | 164 ++++++++++++++++++ 1 file changed, 164 insertions(+) create mode 100644 docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_biomarker_en.md diff --git a/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_biomarker_en.md b/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_biomarker_en.md new file mode 100644 index 00000000000000..6f5d41a3d12c34 --- /dev/null +++ b/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_biomarker_en.md @@ -0,0 +1,164 @@ +--- +layout: model +title: Extract Biomarkers and their Results +author: John Snow Labs +name: ner_oncology_biomarker +date: 2022-10-25 +tags: [licensed, clinical, oncology, en, ner, biomarker] +task: Named Entity Recognition +language: en +edition: Spark NLP for Healthcare 4.0.0 +spark_version: 3.0 +supported: true +article_header: + type: cover 
+use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model extracts mentions of biomarkers and biomarker results from oncology texts. + +## Predicted Entities + +`Biomarker_Result`, `Biomarker` + +{:.btn-box} + +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/27.Oncology_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_oncology_biomarker_en_4.0.0_3.0_1666723339627.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer() \ + .setInputCols(["sentence"]) \ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token"]) \ + .setOutputCol("embeddings") + +ner = MedicalNerModel.pretrained("ner_oncology_biomarker", "en", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +ner_converter = NerConverter() \ + .setInputCols(["sentence", "token", "ner"]) \ + .setOutputCol("ner_chunk") +pipeline = Pipeline(stages=[document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter]) + +data = spark.createDataFrame([["The results of immunohistochemical examination showed that she tested negative for CK7, synaptophysin (Syn), chromogranin A (CgA), Muc5AC, human epidermal growth factor receptor-2 (HER2), and Muc6; positive for CK20, Muc1, Muc2, E-cadherin, and p53; the Ki-67 index was about 87%."]]).toDF("text") + +result = pipeline.fit(data).transform(data) + +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val ner = 
MedicalNerModel.pretrained("ner_oncology_biomarker", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverter() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + + +val pipeline = new Pipeline().setStages(Array(document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter)) + +val data = Seq("The results of immunohistochemical examination showed that she tested negative for CK7, synaptophysin (Syn), chromogranin A (CgA), Muc5AC, human epidermal growth factor receptor-2 (HER2), and Muc6; positive for CK20, Muc1, Muc2, E-cadherin, and p53; the Ki-67 index was about 87%.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash +| chunk | ner_label | +|:-----------------------------------------|:-----------------| +| negative | Biomarker_Result | +| CK7 | Biomarker | +| synaptophysin | Biomarker | +| Syn | Biomarker | +| chromogranin A | Biomarker | +| CgA | Biomarker | +| Muc5AC | Biomarker | +| human epidermal growth factor receptor-2 | Biomarker | +| HER2 | Biomarker | +| Muc6 | Biomarker | +| positive | Biomarker_Result | +| CK20 | Biomarker | +| Muc1 | Biomarker | +| Muc2 | Biomarker | +| E-cadherin | Biomarker | +| p53 | Biomarker | +| Ki-67 index | Biomarker | +| 87% | Biomarker_Result | +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_oncology_biomarker| +|Compatibility:|Spark NLP for Healthcare 4.0.0+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|en| +|Size:|34.3 MB| +|Dependencies:|embeddings_clinical| + +## References + +In-house annotated oncology case reports. 
+ +## Benchmarking + +```bash + label tp fp fn total precision recall f1 +Biomarker_Result 1030.0 148.0 415.0 1445.0 0.87 0.71 0.79 + Biomarker 1685.0 272.0 279.0 1964.0 0.86 0.86 0.86 + macro_avg 2715.0 420.0 694.0 3409.0 0.87 0.79 0.82 + micro_avg NaN NaN NaN NaN 0.87 0.80 0.83 +``` \ No newline at end of file From 14876520046dd70f0e9ee42818ba67addf55e2bd Mon Sep 17 00:00:00 2001 From: mauro-nievoff Date: Wed, 26 Oct 2022 03:17:04 +0700 Subject: [PATCH 15/57] Add model 2022-10-25-ner_oncology_posology_en --- .../2022-10-25-ner_oncology_posology_en.md | 161 ++++++++++++++++++ 1 file changed, 161 insertions(+) create mode 100644 docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_posology_en.md diff --git a/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_posology_en.md b/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_posology_en.md new file mode 100644 index 00000000000000..40ffc0d9ecde1b --- /dev/null +++ b/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_posology_en.md @@ -0,0 +1,161 @@ +--- +layout: model +title: Extract Cancer Therapies and Granular Posology Information +author: John Snow Labs +name: ner_oncology_posology +date: 2022-10-25 +tags: [licensed, clinical, oncology, en, ner, treatment, posology] +task: Named Entity Recognition +language: en +edition: Spark NLP for Healthcare 4.0.0 +spark_version: 3.0 +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model extracts cancer therapies (Cancer_Surgery, Radiotherapy and Cancer_Therapy) and posology information at a granular level. 
+ +## Predicted Entities + +`Cycle_Number`, `Cycle_Count`, `Radiotherapy`, `Cancer_Surgery`, `Cycle_Day`, `Frequency`, `Route`, `Cancer_Therapy`, `Duration`, `Dosage`, `Radiation_Dose` + +{:.btn-box} + +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/27.Oncology_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_oncology_posology_en_4.0.0_3.0_1666728701834.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer() \ + .setInputCols(["sentence"]) \ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token"]) \ + .setOutputCol("embeddings") + +ner = MedicalNerModel.pretrained("ner_oncology_posology", "en", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +ner_converter = NerConverter() \ + .setInputCols(["sentence", "token", "ner"]) \ + .setOutputCol("ner_chunk") +pipeline = Pipeline(stages=[document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter]) + +data = spark.createDataFrame([["The patient underwent a regimen consisting of adriamycin (60 mg/m2) and cyclophosphamide (600 mg/m2) over six courses. 
She is currently receiving his second cycle of chemotherapy and is in good overall condition."]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val ner = MedicalNerModel.pretrained("ner_oncology_posology", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverter() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + + +val pipeline = new Pipeline().setStages(Array(document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter)) + +val data = Seq("The patient underwent a regimen consisting of adriamycin (60 mg/m2) and cyclophosphamide (600 mg/m2) over six courses. She is currently receiving his second cycle of chemotherapy and is in good overall condition.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash +| chunk | ner_label | +|:-----------------|:---------------| +| adriamycin | Cancer_Therapy | +| 60 mg/m2 | Dosage | +| cyclophosphamide | Cancer_Therapy | +| 600 mg/m2 | Dosage | +| six courses | Cycle_Count | +| second cycle | Cycle_Number | +| chemotherapy | Cancer_Therapy | +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_oncology_posology| +|Compatibility:|Spark NLP for Healthcare 4.0.0+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|en| +|Size:|34.3 MB| +|Dependencies:|embeddings_clinical| + +## References + +In-house annotated oncology case reports. + +## Benchmarking + +```bash + label tp fp fn total precision recall f1 + Cycle_Number 52.0 4.0 45.0 97.0 0.93 0.54 0.68 + Cycle_Count 200.0 63.0 30.0 230.0 0.76 0.87 0.81 + Radiotherapy 255.0 16.0 55.0 310.0 0.94 0.82 0.88 +Cancer_Surgery 592.0 66.0 227.0 819.0 0.90 0.72 0.80 + Cycle_Day 175.0 22.0 73.0 248.0 0.89 0.71 0.79 + Frequency 337.0 44.0 90.0 427.0 0.88 0.79 0.83 + Route 53.0 1.0 60.0 113.0 0.98 0.47 0.63 +Cancer_Therapy 1448.0 81.0 250.0 1698.0 0.95 0.85 0.90 + Duration 525.0 154.0 236.0 761.0 0.77 0.69 0.73 + Dosage 858.0 79.0 202.0 1060.0 0.92 0.81 0.86 +Radiation_Dose 86.0 4.0 40.0 126.0 0.96 0.68 0.80 + macro_avg 4581.0 534.0 1308.0 5889.0 0.90 0.72 0.79 + micro_avg NaN NaN NaN NaN 0.90 0.78 0.83 +``` \ No newline at end of file From 25c01eaf5330c1acd6cabfd61c91c386a934f40c Mon Sep 17 00:00:00 2001 From: Cabir C <64752006+Cabir40@users.noreply.github.com> Date: Wed, 26 Oct 2022 14:37:00 +0300 Subject: [PATCH 16/57] updated bancmark --- .../2022-10-25-ner_oncology_anatomy_general_en.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_anatomy_general_en.md b/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_anatomy_general_en.md index 12a24624d94560..c5f68cf32b7f96 100644 
--- a/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_anatomy_general_en.md +++ b/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_anatomy_general_en.md @@ -143,9 +143,9 @@ In-house annotated oncology case reports. ## Benchmarking ```bash - label tp fp fn total precision recall f1 -Anatomical_Site 2946.0 549.0 638.0 3584.0 0.84 0.82 0.83 - Direction 864.0 209.0 120.0 984.0 0.81 0.88 0.84 - macro_avg 3810.0 758.0 758.0 4568.0 0.82 0.85 0.84 - micro_avg NaN NaN NaN NaN 0.83 0.83 0.83 -``` \ No newline at end of file + label tp fp fn total precision recall f1 +Anatomical_Site 2946 549 638 3584 0.84 0.82 0.83 + Direction 864 209 120 984 0.81 0.88 0.84 + macro_avg 3810 758 758 4568 0.82 0.85 0.84 + micro_avg 3810 758 758 4568 0.83 0.83 0.83 +``` From 4b8785b5bdba61436e2b323079311e7e39f0d5c0 Mon Sep 17 00:00:00 2001 From: mauro-nievoff <55700369+mauro-nievoff@users.noreply.github.com> Date: Wed, 26 Oct 2022 10:44:40 -0300 Subject: [PATCH 17/57] Benchmark format updating --- .../2022-10-25-ner_oncology_en.md | 106 +++++++++--------- 1 file changed, 53 insertions(+), 53 deletions(-) diff --git a/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_en.md b/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_en.md index 9af839414e4a68..34110c7da708f1 100644 --- a/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_en.md +++ b/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_en.md @@ -166,56 +166,56 @@ In-house annotated oncology case reports. 
## Benchmarking ```bash - label tp fp fn total precision recall f1 - Histological_Type 339.0 75.0 114.0 453.0 0.82 0.75 0.78 - Direction 832.0 163.0 152.0 984.0 0.84 0.85 0.84 - Staging 229.0 31.0 29.0 258.0 0.88 0.89 0.88 - Cancer_Score 37.0 8.0 25.0 62.0 0.82 0.60 0.69 - Imaging_Test 2027.0 214.0 177.0 2204.0 0.90 0.92 0.91 - Cycle_Number 73.0 29.0 24.0 97.0 0.72 0.75 0.73 - Tumor_Finding 1114.0 64.0 143.0 1257.0 0.95 0.89 0.91 - Site_Lymph_Node 491.0 53.0 60.0 551.0 0.90 0.89 0.90 - Invasion 158.0 36.0 23.0 181.0 0.81 0.87 0.84 -Response_To_Treatment 431.0 149.0 165.0 596.0 0.74 0.72 0.73 - Smoking_Status 66.0 18.0 2.0 68.0 0.79 0.97 0.87 - Tumor_Size 1050.0 112.0 79.0 1129.0 0.90 0.93 0.92 - Cycle_Count 177.0 62.0 53.0 230.0 0.74 0.77 0.75 - Adenopathy 67.0 12.0 29.0 96.0 0.85 0.70 0.77 - Age 930.0 33.0 19.0 949.0 0.97 0.98 0.97 - Biomarker_Result 1160.0 169.0 285.0 1445.0 0.87 0.80 0.84 - Unspecific_Therapy 198.0 86.0 80.0 278.0 0.70 0.71 0.70 - Site_Breast 125.0 15.0 22.0 147.0 0.89 0.85 0.87 - Chemotherapy 814.0 55.0 65.0 879.0 0.94 0.93 0.93 - Targeted_Therapy 195.0 27.0 33.0 228.0 0.88 0.86 0.87 - Radiotherapy 276.0 29.0 34.0 310.0 0.90 0.89 0.90 - Performance_Status 121.0 17.0 14.0 135.0 0.88 0.90 0.89 - Pathology_Test 888.0 296.0 162.0 1050.0 0.75 0.85 0.79 - Site_Other_Body_Part 909.0 275.0 592.0 1501.0 0.77 0.61 0.68 - Cancer_Surgery 693.0 119.0 126.0 819.0 0.85 0.85 0.85 - Line_Of_Therapy 101.0 11.0 5.0 106.0 0.90 0.95 0.93 - Pathology_Result 655.0 279.0 487.0 1142.0 0.70 0.57 0.63 - Hormonal_Therapy 169.0 4.0 16.0 185.0 0.98 0.91 0.94 - Site_Bone 264.0 81.0 49.0 313.0 0.77 0.84 0.80 - Biomarker 1259.0 238.0 256.0 1515.0 0.84 0.83 0.84 - Immunotherapy 103.0 47.0 25.0 128.0 0.69 0.80 0.74 - Cycle_Day 200.0 36.0 48.0 248.0 0.85 0.81 0.83 - Frequency 354.0 27.0 73.0 427.0 0.93 0.83 0.88 - Route 91.0 15.0 22.0 113.0 0.86 0.81 0.83 - Duration 625.0 161.0 136.0 761.0 0.80 0.82 0.81 - Death_Entity 34.0 2.0 4.0 38.0 0.94 0.89 0.92 - Metastasis 353.0 18.0 17.0 
370.0 0.95 0.95 0.95 - Site_Liver 189.0 64.0 45.0 234.0 0.75 0.81 0.78 - Cancer_Dx 1301.0 103.0 93.0 1394.0 0.93 0.93 0.93 - Grade 190.0 27.0 46.0 236.0 0.88 0.81 0.84 - Date 807.0 21.0 24.0 831.0 0.97 0.97 0.97 - Site_Lung 469.0 110.0 90.0 559.0 0.81 0.84 0.82 - Site_Brain 221.0 64.0 58.0 279.0 0.78 0.79 0.78 - Relative_Date 1211.0 401.0 111.0 1322.0 0.75 0.92 0.83 - Race_Ethnicity 57.0 8.0 5.0 62.0 0.88 0.92 0.90 - Gender 1247.0 17.0 7.0 1254.0 0.99 0.99 0.99 - Oncogene 345.0 83.0 104.0 449.0 0.81 0.77 0.79 - Dosage 900.0 30.0 160.0 1060.0 0.97 0.85 0.90 - Radiation_Dose 108.0 5.0 18.0 126.0 0.96 0.86 0.90 - macro_avg 24653.0 3999.0 4406.0 29059.0 0.85 0.84 0.84 - micro_avg NaN NaN NaN NaN 0.86 0.85 0.85 -``` \ No newline at end of file + label tp fp fn total precision recall f1 + Histological_Type 339 75 114 453 0.82 0.75 0.78 + Direction 832 163 152 984 0.84 0.85 0.84 + Staging 229 31 29 258 0.88 0.89 0.88 + Cancer_Score 37 8 25 62 0.82 0.60 0.69 + Imaging_Test 2027 214 177 2204 0.90 0.92 0.91 + Cycle_Number 73 29 24 97 0.72 0.75 0.73 + Tumor_Finding 1114 64 143 1257 0.95 0.89 0.91 + Site_Lymph_Node 491 53 60 551 0.90 0.89 0.90 + Invasion 158 36 23 181 0.81 0.87 0.84 +Response_To_Treatment 431 149 165 596 0.74 0.72 0.73 + Smoking_Status 66 18 2 68 0.79 0.97 0.87 + Tumor_Size 1050 112 79 1129 0.90 0.93 0.92 + Cycle_Count 177 62 53 230 0.74 0.77 0.75 + Adenopathy 67 12 29 96 0.85 0.70 0.77 + Age 930 33 19 949 0.97 0.98 0.97 + Biomarker_Result 1160 169 285 1445 0.87 0.80 0.84 + Unspecific_Therapy 198 86 80 278 0.70 0.71 0.70 + Site_Breast 125 15 22 147 0.89 0.85 0.87 + Chemotherapy 814 55 65 879 0.94 0.93 0.93 + Targeted_Therapy 195 27 33 228 0.88 0.86 0.87 + Radiotherapy 276 29 34 310 0.90 0.89 0.90 + Performance_Status 121 17 14 135 0.88 0.90 0.89 + Pathology_Test 888 296 162 1050 0.75 0.85 0.79 + Site_Other_Body_Part 909 275 592 1501 0.77 0.61 0.68 + Cancer_Surgery 693 119 126 819 0.85 0.85 0.85 + Line_Of_Therapy 101 11 5 106 0.90 0.95 0.93 + Pathology_Result 
655 279 487 1142 0.70 0.57 0.63 + Hormonal_Therapy 169 4 16 185 0.98 0.91 0.94 + Site_Bone 264 81 49 313 0.77 0.84 0.80 + Biomarker 1259 238 256 1515 0.84 0.83 0.84 + Immunotherapy 103 47 25 128 0.69 0.80 0.74 + Cycle_Day 200 36 48 248 0.85 0.81 0.83 + Frequency 354 27 73 427 0.93 0.83 0.88 + Route 91 15 22 113 0.86 0.81 0.83 + Duration 625 161 136 761 0.80 0.82 0.81 + Death_Entity 34 2 4 38 0.94 0.89 0.92 + Metastasis 353 18 17 370 0.95 0.95 0.95 + Site_Liver 189 64 45 234 0.75 0.81 0.78 + Cancer_Dx 1301 103 93 1394 0.93 0.93 0.93 + Grade 190 27 46 236 0.88 0.81 0.84 + Date 807 21 24 831 0.97 0.97 0.97 + Site_Lung 469 110 90 559 0.81 0.84 0.82 + Site_Brain 221 64 58 279 0.78 0.79 0.78 + Relative_Date 1211 401 111 1322 0.75 0.92 0.83 + Race_Ethnicity 57 8 5 62 0.88 0.92 0.90 + Gender 1247 17 7 1254 0.99 0.99 0.99 + Oncogene 345 83 104 449 0.81 0.77 0.79 + Dosage 900 30 160 1060 0.97 0.85 0.90 + Radiation_Dose 108 5 18 126 0.96 0.86 0.90 + macro_avg 24653 3999 4406 29059 0.85 0.84 0.84 + micro_avg 24653 3999 4406 29059 0.86 0.85 0.85 +``` From 5740c9e3ea7bf6239b3668498fd21e8cebac0bf1 Mon Sep 17 00:00:00 2001 From: mauro-nievoff <55700369+mauro-nievoff@users.noreply.github.com> Date: Wed, 26 Oct 2022 10:47:16 -0300 Subject: [PATCH 18/57] Benchmark format updating --- .../2022-10-25-ner_oncology_therapy_en.md | 42 +++++++++---------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_therapy_en.md b/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_therapy_en.md index 15b311fa773fb3..51dde8aca7eeba 100644 --- a/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_therapy_en.md +++ b/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_therapy_en.md @@ -152,24 +152,24 @@ In-house annotated oncology case reports. 
## Benchmarking ```bash - label tp fp fn total precision recall f1 - Cycle_Number 78.0 41.0 19.0 97.0 0.66 0.80 0.72 -Response_To_Treatment 451.0 205.0 145.0 596.0 0.69 0.76 0.72 - Cycle_Count 210.0 75.0 20.0 230.0 0.74 0.91 0.82 - Unspecific_Therapy 189.0 76.0 89.0 278.0 0.71 0.68 0.70 - Chemotherapy 831.0 87.0 48.0 879.0 0.91 0.95 0.92 - Targeted_Therapy 194.0 28.0 34.0 228.0 0.87 0.85 0.86 - Radiotherapy 279.0 35.0 31.0 310.0 0.89 0.90 0.89 - Cancer_Surgery 720.0 192.0 99.0 819.0 0.79 0.88 0.83 - Line_Of_Therapy 95.0 6.0 11.0 106.0 0.94 0.90 0.92 - Hormonal_Therapy 170.0 6.0 15.0 185.0 0.97 0.92 0.94 - Immunotherapy 96.0 17.0 32.0 128.0 0.85 0.75 0.80 - Cycle_Day 205.0 38.0 43.0 248.0 0.84 0.83 0.84 - Frequency 363.0 33.0 64.0 427.0 0.92 0.85 0.88 - Route 93.0 6.0 20.0 113.0 0.94 0.82 0.88 - Duration 527.0 102.0 234.0 761.0 0.84 0.69 0.76 - Dosage 959.0 63.0 101.0 1060.0 0.94 0.90 0.92 - Radiation_Dose 106.0 12.0 20.0 126.0 0.90 0.84 0.87 - macro_avg 5566.0 1022.0 1025.0 6591.0 0.85 0.84 0.84 - micro_avg NaN NaN NaN NaN 0.85 0.84 0.84 -``` \ No newline at end of file + label tp fp fn total precision recall f1 + Cycle_Number 78 41 19 97 0.66 0.80 0.72 +Response_To_Treatment 451 205 145 596 0.69 0.76 0.72 + Cycle_Count 210 75 20 230 0.74 0.91 0.82 + Unspecific_Therapy 189 76 89 278 0.71 0.68 0.70 + Chemotherapy 831 87 48 879 0.91 0.95 0.92 + Targeted_Therapy 194 28 34 228 0.87 0.85 0.86 + Radiotherapy 279 35 31 310 0.89 0.90 0.89 + Cancer_Surgery 720 192 99 819 0.79 0.88 0.83 + Line_Of_Therapy 95 6 11 106 0.94 0.90 0.92 + Hormonal_Therapy 170 6 15 185 0.97 0.92 0.94 + Immunotherapy 96 17 32 128 0.85 0.75 0.80 + Cycle_Day 205 38 43 248 0.84 0.83 0.84 + Frequency 363 33 64 427 0.92 0.85 0.88 + Route 93 6 20 113 0.94 0.82 0.88 + Duration 527 102 234 761 0.84 0.69 0.76 + Dosage 959 63 101 1060 0.94 0.90 0.92 + Radiation_Dose 106 12 20 126 0.90 0.84 0.87 + macro_avg 5566 1022 1025 6591 0.85 0.84 0.84 + micro_avg 5566 1022 1025 6591 0.85 0.84 0.84 +``` From 
dc2aede5e36f86c63a330189c4e96e026f5efb80 Mon Sep 17 00:00:00 2001 From: mauro-nievoff <55700369+mauro-nievoff@users.noreply.github.com> Date: Wed, 26 Oct 2022 10:48:06 -0300 Subject: [PATCH 19/57] Benchmark format updating --- .../2022-10-25-ner_oncology_diagnosis_en.md | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_diagnosis_en.md b/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_diagnosis_en.md index 82dcc17203a594..6c40382155b417 100644 --- a/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_diagnosis_en.md +++ b/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_diagnosis_en.md @@ -145,19 +145,19 @@ In-house annotated oncology case reports. ## Benchmarking ```bash - label tp fp fn total precision recall f1 - Histological_Type 354.0 63.0 99.0 453.0 0.85 0.78 0.81 - Staging 234.0 27.0 24.0 258.0 0.90 0.91 0.90 - Cancer_Score 36.0 15.0 26.0 62.0 0.71 0.58 0.64 - Tumor_Finding 1121.0 83.0 136.0 1257.0 0.93 0.89 0.91 - Invasion 154.0 27.0 27.0 181.0 0.85 0.85 0.85 - Tumor_Size 1058.0 126.0 71.0 1129.0 0.89 0.94 0.91 - Adenopathy 66.0 10.0 30.0 96.0 0.87 0.69 0.77 -Performance_Status 116.0 15.0 19.0 135.0 0.89 0.86 0.87 - Pathology_Result 852.0 686.0 290.0 1142.0 0.55 0.75 0.64 - Metastasis 356.0 15.0 14.0 370.0 0.96 0.96 0.96 - Cancer_Dx 1302.0 88.0 92.0 1394.0 0.94 0.93 0.94 - Grade 201.0 23.0 35.0 236.0 0.90 0.85 0.87 - macro_avg 5850.0 1178.0 863.0 6713.0 0.85 0.83 0.84 - micro_avg NaN NaN NaN NaN 0.85 0.87 0.86 -``` \ No newline at end of file + label tp fp fn total precision recall f1 + Histological_Type 354 63 99 453 0.85 0.78 0.81 + Staging 234 27 24 258 0.90 0.91 0.90 + Cancer_Score 36 15 26 62 0.71 0.58 0.64 + Tumor_Finding 1121 83 136 1257 0.93 0.89 0.91 + Invasion 154 27 27 181 0.85 0.85 0.85 + Tumor_Size 1058 126 71 1129 0.89 0.94 0.91 + Adenopathy 66 10 30 96 0.87 0.69 0.77 +Performance_Status 116 15 19 135 0.89 0.86 0.87 + Pathology_Result 852 686 290 1142 0.55 
0.75 0.64 + Metastasis 356 15 14 370 0.96 0.96 0.96 + Cancer_Dx 1302 88 92 1394 0.94 0.93 0.94 + Grade 201 23 35 236 0.90 0.85 0.87 + macro_avg 5850 1178 863 6713 0.85 0.83 0.84 + micro_avg 5850 1178 863 6713 0.85 0.87 0.86 +``` From 751a7173cea37a5a632377d560f80d9c62360757 Mon Sep 17 00:00:00 2001 From: mauro-nievoff <55700369+mauro-nievoff@users.noreply.github.com> Date: Wed, 26 Oct 2022 10:48:45 -0300 Subject: [PATCH 20/57] Benchmark format updating --- .../2022-10-25-ner_oncology_tnm_en.md | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_tnm_en.md b/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_tnm_en.md index e9779d71d1b040..6bb7f7a2376577 100644 --- a/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_tnm_en.md +++ b/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_tnm_en.md @@ -143,14 +143,14 @@ In-house annotated oncology case reports. ## Benchmarking ```bash - label tp fp fn total precision recall f1 - Lymph_Node 570.0 77.0 77.0 647.0 0.88 0.88 0.88 - Staging 232.0 22.0 26.0 258.0 0.91 0.90 0.91 -Lymph_Node_Modifier 30.0 5.0 5.0 35.0 0.86 0.86 0.86 - Tumor_Description 2651.0 581.0 490.0 3141.0 0.82 0.84 0.83 - Tumor 1116.0 72.0 141.0 1257.0 0.94 0.89 0.91 - Metastasis 358.0 15.0 12.0 370.0 0.96 0.97 0.96 - Cancer_Dx 1302.0 87.0 92.0 1394.0 0.94 0.93 0.94 - macro_avg 6259.0 859.0 843.0 7102.0 0.90 0.90 0.90 - micro_avg NaN NaN NaN NaN 0.88 0.88 0.88 -``` \ No newline at end of file + label tp fp fn total precision recall f1 + Lymph_Node 570 77 77 647 0.88 0.88 0.88 + Staging 232 22 26 258 0.91 0.90 0.91 +Lymph_Node_Modifier 30 5 5 35 0.86 0.86 0.86 + Tumor_Description 2651 581 490 3141 0.82 0.84 0.83 + Tumor 1116 72 141 1257 0.94 0.89 0.91 + Metastasis 358 15 12 370 0.96 0.97 0.96 + Cancer_Dx 1302 87 92 1394 0.94 0.93 0.94 + macro_avg 6259 859 843 7102 0.90 0.90 0.90 + micro_avg 6259 859 843 7102 0.88 0.88 0.88 +``` From 82d68aafe358949a8fc48f6587d3ef97113d1416 Mon 
Sep 17 00:00:00 2001 From: mauro-nievoff <55700369+mauro-nievoff@users.noreply.github.com> Date: Wed, 26 Oct 2022 10:50:29 -0300 Subject: [PATCH 21/57] Update 2022-10-25-ner_oncology_anatomy_general_en.md --- .../2022-10-25-ner_oncology_anatomy_general_en.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_anatomy_general_en.md b/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_anatomy_general_en.md index c5f68cf32b7f96..d818710a1480f9 100644 --- a/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_anatomy_general_en.md +++ b/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_anatomy_general_en.md @@ -143,9 +143,9 @@ In-house annotated oncology case reports. ## Benchmarking ```bash - label tp fp fn total precision recall f1 -Anatomical_Site 2946 549 638 3584 0.84 0.82 0.83 - Direction 864 209 120 984 0.81 0.88 0.84 - macro_avg 3810 758 758 4568 0.82 0.85 0.84 - micro_avg 3810 758 758 4568 0.83 0.83 0.83 + label tp fp fn total precision recall f1 +Anatomical_Site 2946 549 638 3584 0.84 0.82 0.83 + Direction 864 209 120 984 0.81 0.88 0.84 + macro_avg 3810 758 758 4568 0.82 0.85 0.84 + micro_avg 3810 758 758 4568 0.83 0.83 0.83 ``` From 21057d26f4b9d5de40eedb2f822d42666ff11115 Mon Sep 17 00:00:00 2001 From: mauro-nievoff <55700369+mauro-nievoff@users.noreply.github.com> Date: Wed, 26 Oct 2022 10:51:01 -0300 Subject: [PATCH 22/57] Benchmark format updating --- .../2022-10-25-ner_oncology_demographics_en.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_demographics_en.md b/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_demographics_en.md index cc60b3ecf53588..25f008b778932f 100644 --- a/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_demographics_en.md +++ b/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_demographics_en.md @@ -140,11 +140,11 @@ In-house annotated oncology case reports. 
## Benchmarking ```bash - label tp fp fn total precision recall f1 -Smoking_Status 60.0 19.0 8.0 68.0 0.76 0.88 0.82 - Age 934.0 33.0 15.0 949.0 0.97 0.98 0.97 -Race_Ethnicity 57.0 5.0 5.0 62.0 0.92 0.92 0.92 - Gender 1248.0 18.0 6.0 1254.0 0.99 1.00 0.99 - macro_avg 2299.0 75.0 34.0 2333.0 0.91 0.95 0.93 - micro_avg NaN NaN NaN NaN 0.97 0.99 0.98 -``` \ No newline at end of file + label tp fp fn total precision recall f1 +Smoking_Status 60 19 8 68 0.76 0.88 0.82 + Age 934 33 15 949 0.97 0.98 0.97 +Race_Ethnicity 57 5 5 62 0.92 0.92 0.92 + Gender 1248 18 6 1254 0.99 1.00 0.99 + macro_avg 2299 75 34 2333 0.91 0.95 0.93 + micro_avg 2299 75 34 2333 0.97 0.99 0.98 +``` From 6d54bf8020945c57f1b44db5ee6489d34848185c Mon Sep 17 00:00:00 2001 From: mauro-nievoff <55700369+mauro-nievoff@users.noreply.github.com> Date: Wed, 26 Oct 2022 10:51:42 -0300 Subject: [PATCH 23/57] Benchmark format updating --- .../2022-10-25-ner_oncology_test_en.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_test_en.md b/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_test_en.md index a01865da5ea3c8..3c0d004671db08 100644 --- a/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_test_en.md +++ b/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_test_en.md @@ -141,12 +141,12 @@ In-house annotated oncology case reports. 
## Benchmarking ```bash - label tp fp fn total precision recall f1 - Imaging_Test 2020.0 229.0 184.0 2204.0 0.90 0.92 0.91 -Biomarker_Result 1177.0 186.0 268.0 1445.0 0.86 0.81 0.84 - Pathology_Test 888.0 276.0 162.0 1050.0 0.76 0.85 0.80 - Biomarker 1287.0 254.0 228.0 1515.0 0.84 0.85 0.84 - Oncogene 365.0 89.0 84.0 449.0 0.80 0.81 0.81 - macro_avg 5737.0 1034.0 926.0 6663.0 0.83 0.85 0.84 - micro_avg NaN NaN NaN NaN 0.85 0.86 0.85 -``` \ No newline at end of file + label tp fp fn total precision recall f1 + Imaging_Test 2020 229 184 2204 0.90 0.92 0.91 +Biomarker_Result 1177 186 268 1445 0.86 0.81 0.84 + Pathology_Test 888 276 162 1050 0.76 0.85 0.80 + Biomarker 1287 254 228 1515 0.84 0.85 0.84 + Oncogene 365 89 84 449 0.80 0.81 0.81 + macro_avg 5737 1034 926 6663 0.83 0.85 0.84 + micro_avg 5737 1034 926 6663 0.85 0.86 0.85 +``` From cae01bbcacf4bca6a3fb202aedcb1d5ede95e166 Mon Sep 17 00:00:00 2001 From: mauro-nievoff <55700369+mauro-nievoff@users.noreply.github.com> Date: Wed, 26 Oct 2022 10:52:16 -0300 Subject: [PATCH 24/57] Benchmark format updating --- ...2022-10-25-ner_oncology_unspecific_posology_en.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_unspecific_posology_en.md b/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_unspecific_posology_en.md index 833d8a6a31304b..1a764edb25b88f 100644 --- a/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_unspecific_posology_en.md +++ b/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_unspecific_posology_en.md @@ -146,9 +146,9 @@ In-house annotated oncology case reports. 
## Benchmarking ```bash - label tp fp fn total precision recall f1 -Posology_Information 2663.0 244.0 399.0 3062.0 0.92 0.87 0.89 - Cancer_Therapy 2580.0 317.0 247.0 2827.0 0.89 0.91 0.90 - macro_avg 5243.0 561.0 646.0 5889.0 0.90 0.89 0.90 - micro_avg NaN NaN NaN NaN 0.90 0.89 0.90 -``` \ No newline at end of file + label tp fp fn total precision recall f1 +Posology_Information 2663 244 399 3062 0.92 0.87 0.89 + Cancer_Therapy 2580 317 247 2827 0.89 0.91 0.90 + macro_avg 5243 561 646 5889 0.90 0.89 0.90 + micro_avg 5243 561 646 5889 0.90 0.89 0.90 +``` From 7df2fd91fe62890790ead965cc6215fb937e64e9 Mon Sep 17 00:00:00 2001 From: mauro-nievoff <55700369+mauro-nievoff@users.noreply.github.com> Date: Wed, 26 Oct 2022 10:52:59 -0300 Subject: [PATCH 25/57] Benchmark format update --- ...-10-25-ner_oncology_anatomy_granular_en.md | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_anatomy_granular_en.md b/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_anatomy_granular_en.md index 09ca038910aa19..f3369b9ad71539 100644 --- a/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_anatomy_granular_en.md +++ b/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_anatomy_granular_en.md @@ -141,15 +141,15 @@ In-house annotated oncology case reports. 
## Benchmarking ```bash - label tp fp fn total precision recall f1 - Direction 822.0 221.0 162.0 984.0 0.79 0.84 0.81 - Site_Lymph_Node 481.0 38.0 70.0 551.0 0.93 0.87 0.90 - Site_Breast 88.0 14.0 59.0 147.0 0.86 0.60 0.71 -Site_Other_Body_Part 604.0 184.0 897.0 1501.0 0.77 0.40 0.53 - Site_Bone 252.0 74.0 61.0 313.0 0.77 0.81 0.79 - Site_Liver 178.0 92.0 56.0 234.0 0.66 0.76 0.71 - Site_Lung 398.0 98.0 161.0 559.0 0.80 0.71 0.75 - Site_Brain 197.0 44.0 82.0 279.0 0.82 0.71 0.76 - macro_avg 3020.0 765.0 1548.0 4568.0 0.80 0.71 0.74 - micro_avg NaN NaN NaN NaN 0.80 0.66 0.71 -``` \ No newline at end of file + label tp fp fn total precision recall f1 + Direction 822 221 162 984 0.79 0.84 0.81 + Site_Lymph_Node 481 38 70 551 0.93 0.87 0.90 + Site_Breast 88 14 59 147 0.86 0.60 0.71 +Site_Other_Body_Part 604 184 897 1501 0.77 0.40 0.53 + Site_Bone 252 74 61 313 0.77 0.81 0.79 + Site_Liver 178 92 56 234 0.66 0.76 0.71 + Site_Lung 398 98 161 559 0.80 0.71 0.75 + Site_Brain 197 44 82 279 0.82 0.71 0.76 + macro_avg 3020 765 1548 4568 0.80 0.71 0.74 + micro_avg 3020 765 1548 4568 0.80 0.66 0.71 +``` From f675d8dece289fe9601e360d4d215141d211a1a9 Mon Sep 17 00:00:00 2001 From: mauro-nievoff <55700369+mauro-nievoff@users.noreply.github.com> Date: Wed, 26 Oct 2022 10:53:33 -0300 Subject: [PATCH 26/57] Benchmark format update --- ...-10-25-ner_oncology_response_to_treatment_en.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_response_to_treatment_en.md b/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_response_to_treatment_en.md index 8912d175e260ac..bc76ea5e368a3b 100644 --- a/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_response_to_treatment_en.md +++ b/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_response_to_treatment_en.md @@ -139,10 +139,10 @@ In-house annotated oncology case reports. 
## Benchmarking ```bash - label tp fp fn total precision recall f1 -Response_To_Treatment 326.0 101.0 157.0 483.0 0.76 0.67 0.72 - Size_Trend 43.0 28.0 70.0 113.0 0.61 0.38 0.47 - Line_Of_Therapy 99.0 11.0 7.0 106.0 0.90 0.93 0.92 - macro_avg 468.0 140.0 234.0 702.0 0.76 0.66 0.70 - micro_avg NaN NaN NaN NaN 0.76 0.67 0.71 -``` \ No newline at end of file + label tp fp fn total precision recall f1 +Response_To_Treatment 326 101 157 483 0.76 0.67 0.72 + Size_Trend 43 28 70 113 0.61 0.38 0.47 + Line_Of_Therapy 99 11 7 106 0.90 0.93 0.92 + macro_avg 468 140 234 702 0.76 0.66 0.70 + micro_avg 468 140 234 702 0.76 0.67 0.71 +``` From 87390919af4a4c818bd47d06fbbc01c9dbe6a4c4 Mon Sep 17 00:00:00 2001 From: mauro-nievoff <55700369+mauro-nievoff@users.noreply.github.com> Date: Wed, 26 Oct 2022 10:54:02 -0300 Subject: [PATCH 27/57] Benchmark format update --- .../2022-10-25-ner_oncology_biomarker_en.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_biomarker_en.md b/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_biomarker_en.md index 6f5d41a3d12c34..bd2cd824ca0ad4 100644 --- a/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_biomarker_en.md +++ b/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_biomarker_en.md @@ -156,9 +156,9 @@ In-house annotated oncology case reports. 
## Benchmarking ```bash - label tp fp fn total precision recall f1 -Biomarker_Result 1030.0 148.0 415.0 1445.0 0.87 0.71 0.79 - Biomarker 1685.0 272.0 279.0 1964.0 0.86 0.86 0.86 - macro_avg 2715.0 420.0 694.0 3409.0 0.87 0.79 0.82 - micro_avg NaN NaN NaN NaN 0.87 0.80 0.83 -``` \ No newline at end of file + label tp fp fn total precision recall f1 +Biomarker_Result 1030 148 415 1445 0.87 0.71 0.79 + Biomarker 1685 272 279 1964 0.86 0.86 0.86 + macro_avg 2715 420 694 3409 0.87 0.79 0.82 + micro_avg 2715 420 694 3409 0.87 0.80 0.83 +``` From 159075232ed07850b3c152b24be6df71e2f2b1de Mon Sep 17 00:00:00 2001 From: mauro-nievoff <55700369+mauro-nievoff@users.noreply.github.com> Date: Wed, 26 Oct 2022 10:54:31 -0300 Subject: [PATCH 28/57] Benchmark format update --- .../2022-10-25-ner_oncology_posology_en.md | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_posology_en.md b/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_posology_en.md index 40ffc0d9ecde1b..2aedd79fcf878b 100644 --- a/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_posology_en.md +++ b/docs/_posts/mauro-nievoff/2022-10-25-ner_oncology_posology_en.md @@ -144,18 +144,18 @@ In-house annotated oncology case reports. 
## Benchmarking ```bash - label tp fp fn total precision recall f1 - Cycle_Number 52.0 4.0 45.0 97.0 0.93 0.54 0.68 - Cycle_Count 200.0 63.0 30.0 230.0 0.76 0.87 0.81 - Radiotherapy 255.0 16.0 55.0 310.0 0.94 0.82 0.88 -Cancer_Surgery 592.0 66.0 227.0 819.0 0.90 0.72 0.80 - Cycle_Day 175.0 22.0 73.0 248.0 0.89 0.71 0.79 - Frequency 337.0 44.0 90.0 427.0 0.88 0.79 0.83 - Route 53.0 1.0 60.0 113.0 0.98 0.47 0.63 -Cancer_Therapy 1448.0 81.0 250.0 1698.0 0.95 0.85 0.90 - Duration 525.0 154.0 236.0 761.0 0.77 0.69 0.73 - Dosage 858.0 79.0 202.0 1060.0 0.92 0.81 0.86 -Radiation_Dose 86.0 4.0 40.0 126.0 0.96 0.68 0.80 - macro_avg 4581.0 534.0 1308.0 5889.0 0.90 0.72 0.79 - micro_avg NaN NaN NaN NaN 0.90 0.78 0.83 -``` \ No newline at end of file + label tp fp fn total precision recall f1 + Cycle_Number 52 4 45 97 0.93 0.54 0.68 + Cycle_Count 200 63 30 230 0.76 0.87 0.81 + Radiotherapy 255 16 55 310 0.94 0.82 0.88 +Cancer_Surgery 592 66 227 819 0.90 0.72 0.80 + Cycle_Day 175 22 73 248 0.89 0.71 0.79 + Frequency 337 44 90 427 0.88 0.79 0.83 + Route 53 1 60 113 0.98 0.47 0.63 +Cancer_Therapy 1448 81 250 1698 0.95 0.85 0.90 + Duration 525 154 236 761 0.77 0.69 0.73 + Dosage 858 79 202 1060 0.92 0.81 0.86 +Radiation_Dose 86 4 40 126 0.96 0.68 0.80 + macro_avg 4581 534 1308 5889 0.90 0.72 0.79 + micro_avg 4581 534 1308 5889 0.90 0.78 0.83 +``` From 27c0cb3251e8d24578f0bb73769ad7548f258658 Mon Sep 17 00:00:00 2001 From: Ahmetemintek Date: Fri, 28 Oct 2022 21:33:13 +0700 Subject: [PATCH 29/57] Add model 2022-10-28-sbiobertresolve_icd10pcs_augmented_en --- ...8-sbiobertresolve_icd10pcs_augmented_en.md | 173 ++++++++++++++++++ 1 file changed, 173 insertions(+) create mode 100644 docs/_posts/Ahmetemintek/2022-10-28-sbiobertresolve_icd10pcs_augmented_en.md diff --git a/docs/_posts/Ahmetemintek/2022-10-28-sbiobertresolve_icd10pcs_augmented_en.md b/docs/_posts/Ahmetemintek/2022-10-28-sbiobertresolve_icd10pcs_augmented_en.md new file mode 100644 index 00000000000000..7312afe624390b --- 
/dev/null +++ b/docs/_posts/Ahmetemintek/2022-10-28-sbiobertresolve_icd10pcs_augmented_en.md @@ -0,0 +1,173 @@ +--- +layout: model +title: Sentence Entity Resolver for ICD-10-PCS (Augmented) +author: John Snow Labs +name: sbiobertresolve_icd10pcs_augmented +date: 2022-10-28 +tags: [entity_resolution, clinical, en, licensed] +task: Entity Resolution +language: en +edition: Spark NLP for Healthcare 4.2.1 +spark_version: 3.0 +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model maps extracted medical entities to ICD-10-PCS codes using `sbiobert_base_cased_mli` Sentence Bert Embeddings. It is trained on the augmented version of the dataset used in the previous ICD-10-PCS resolver model. + +## Predicted Entities + +`ICD-10-PCS Codes` + +{:.btn-box} + +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/3.Clinical_Entity_Resolvers.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/sbiobertresolve_icd10pcs_augmented_en_4.2.1_3.0_1666966980428.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + + +sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer()\ + .setInputCols(["sentence"])\ + .setOutputCol("token") + + +word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence","token"])\ + .setOutputCol("embeddings") + + +clinical_ner = MedicalNerModel.pretrained("ner_jsl", "en", "clinical/models")\ + .setInputCols(["sentence","token","embeddings"])\ + .setOutputCol("ner") + +ner_converter = NerConverter()\ + .setInputCols(["sentence","token","ner"])\ + .setOutputCol("ner_chunk")\ + .setWhiteList(['Procedure', 'Test', 'Test_Result', 'Treatment', 'Pulse', 'Imaging_Technique', 'Labour_Delivery', 'Blood_Pressure', 'Oxygen_Therapy', 'Weight', 'LDL', 'O2_Saturation', 'BMI', 'Vaccine', 'Respiration', 'Temperature', 'Birth_Entity', 'Triglycerides', 'Puerperium']) + + +chunk2doc = Chunk2Doc().setInputCols("ner_chunk").setOutputCol("ner_chunk_doc") + +sbert_embedder = BertSentenceEmbeddings\ + .pretrained("sbiobert_base_cased_mli","en","clinical/models")\ + .setInputCols(["ner_chunk_doc"])\ + .setOutputCol("sbert_embeddings") + +icd10pcs_resolver = SentenceEntityResolverModel\ + .pretrained("sbiobertresolve_icd10pcs_augmented","en", "clinical/models") \ + .setInputCols(["ner_chunk", "sbert_embeddings"]) \ + .setOutputCol("resolution")\ + .setDistanceFunction("EUCLIDEAN") + +nlpPipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, word_embeddings, clinical_ner, ner_converter, chunk2doc, sbert_embedder, icd10pcs_resolver]) + + +text = [["""Given the severity of her abdominal examination and her persistence of her symptoms, it is detected that need for laparoscopic appendectomy 
and possible open appendectomy as well as pyeloplasty. We recommend performing a mediastinoscopy"""]] + + +data= spark.createDataFrame(text).toDF('text') +results = nlpPipeline.fit(data).transform(data) + +``` +```scala +val document_assembler = DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + + +val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") +.setInputCols(Array("document")) +.setOutputCol("sentence") + +val tokenizer = Tokenizer() +.setInputCols(Array("sentence")) +.setOutputCol("token") + + +val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") +.setInputCols(Array("sentence","token")) +.setOutputCol("embeddings") + + +val clinical_ner = MedicalNerModel.pretrained("ner_jsl", "en", "clinical/models") +.setInputCols(Array("sentence","token","embeddings")) +.setOutputCol("ner") + +val ner_converter = NerConverter() +.setInputCols(Array("sentence","token","ner")) +.setOutputCol("ner_chunk") +.setWhiteList(Array("Procedure", "Test", "Test_Result", "Treatment", "Pulse", "Imaging_Technique", "Labour_Delivery", "Blood_Pressure", "Oxygen_Therapy", "Weight", "LDL", "O2_Saturation", "BMI", "Vaccine", "Respiration", "Temperature", "Birth_Entity", "Triglycerides", "Puerperium")) + + +val chunk2doc = Chunk2Doc().setInputCols("ner_chunk").setOutputCol("ner_chunk_doc") + +val sbert_embedder = BertSentenceEmbeddings +.pretrained("sbiobert_base_cased_mli","en","clinical/models") +.setInputCols(Array("ner_chunk_doc")) +.setOutputCol("sbert_embeddings") + +val icd10pcs_resolver = SentenceEntityResolverModel +.pretrained("sbiobertresolve_icd10pcs_augmented","en", "clinical/models") +.setInputCols(Array("ner_chunk", "sbert_embeddings")) +.setOutputCol("resolution") +.setDistanceFunction("EUCLIDEAN") + +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, word_embeddings, clinical_ner, ner_converter, 
chunk2doc, sbert_embedder, icd10pcs_resolver)) + +val data = Seq("Given the severity of her abdominal examination and her persistence of her symptoms, it is detected that need for laparoscopic appendectomy and possible open appendectomy as well as pyeloplasty. We recommend performing a mediastinoscopy").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash ++-------------------------+---------+-------------+------------------------------------------------------------+------------------------------------------------------------+ +| ner_chunk| entity|icd10pcs_code| resolutions| all_codes| ++-------------------------+---------+-------------+------------------------------------------------------------+------------------------------------------------------------+ +| abdominal examination| Test| 2W63XZZ|[traction of abdominal wall [traction of abdominal wall],...|[2W63XZZ, BW40ZZZ, DWY37ZZ, 0WJFXZZ, 2W03X2Z, 0WJF4ZZ, 0W...| +|laparoscopic appendectomy|Procedure| 0DTJ8ZZ|[resection of appendix, endo [resection of appendix, endo...|[0DTJ8ZZ, 0DT84ZZ, 0DTJ4ZZ, 0WBH4ZZ, 0DTR4ZZ, 0DBJ8ZZ, 0D...| +| open appendectomy|Procedure| 0DBJ0ZZ|[excision of appendix, open approach [excision of appendi...|[0DBJ0ZZ, 0DTJ0ZZ, 0DBA0ZZ, 0D5J0ZZ, 0DB80ZZ, 0DB90ZZ, 04...| +| pyeloplasty|Procedure| 0TS84ZZ|[reposition bilateral ureters, perc endo approach [reposi...|[0TS84ZZ, 0TS74ZZ, 069B3ZZ, 06SB3ZZ, 0TR74JZ, 0TQ43ZZ, 04...| +| mediastinoscopy|Procedure| BB1CZZZ|[fluoroscopy of mediastinum [fluoroscopy of mediastinum],...|[BB1CZZZ, 0WJC4ZZ, BB4CZZZ, 0WJC3ZZ, 0WHC33Z, 0WHC43Z, 0W...| ++-------------------------+---------+-------------+------------------------------------------------------------+------------------------------------------------------------+ + +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|sbiobertresolve_icd10pcs_augmented| +|Compatibility:|Spark NLP for Healthcare 4.2.1+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence_embeddings]| +|Output Labels:|[icd10pcs_code]| +|Language:|en| +|Size:|649.1 MB| +|Case sensitive:|false| + +## References + +Trained on ICD-10 Procedure Coding System dataset with sbiobert_base_cased_mli sentence embeddings. 
https://www.icd10data.com/ICD10PCS/Codes \ No newline at end of file From 5b0e2f3fb47a96616822a70c977fffccbdb64a0a Mon Sep 17 00:00:00 2001 From: Ahmetemintek Date: Fri, 28 Oct 2022 16:02:09 +0100 Subject: [PATCH 30/57] Update 2022-10-28-sbiobertresolve_icd10pcs_augmented_en.md --- .../2022-10-28-sbiobertresolve_icd10pcs_augmented_en.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/_posts/Ahmetemintek/2022-10-28-sbiobertresolve_icd10pcs_augmented_en.md b/docs/_posts/Ahmetemintek/2022-10-28-sbiobertresolve_icd10pcs_augmented_en.md index 7312afe624390b..688fc2b667fe97 100644 --- a/docs/_posts/Ahmetemintek/2022-10-28-sbiobertresolve_icd10pcs_augmented_en.md +++ b/docs/_posts/Ahmetemintek/2022-10-28-sbiobertresolve_icd10pcs_augmented_en.md @@ -61,7 +61,7 @@ clinical_ner = MedicalNerModel.pretrained("ner_jsl", "en", "clinical/models")\ ner_converter = NerConverter()\ .setInputCols(["sentence","token","ner"])\ .setOutputCol("ner_chunk")\ - .setWhiteList(['Procedure', 'Test', 'Test_Result', 'Treatment', 'Pulse', 'Imaging_Technique', 'Labour_Delivery', 'Blood_Pressure', 'Oxygen_Therapy', 'Weight', 'LDL', 'O2_Saturation', 'BMI', 'Vaccine', 'Respiration', 'Temperature', 'Birth_Entity', 'Triglycerides', 'Puerperium']) + .setWhiteList(['Procedure', 'Test', 'Test_Result', 'Treatment', 'Pulse', 'Imaging_Technique', 'Labour_Delivery', 'Blood_Pressure', 'Oxygen_Therapy', 'Weight', 'LDL', 'O2_Saturation', 'BMI', 'Vaccine', 'Respiration', 'Temperature', 'Birth_Entity', 'Triglycerides']) chunk2doc = Chunk2Doc().setInputCols("ner_chunk").setOutputCol("ner_chunk_doc") @@ -114,7 +114,7 @@ val clinical_ner = MedicalNerModel.pretrained("ner_jsl", "en", "clinical/models" val ner_converter = NerConverter() .setInputCols(Array("sentence","token","ner")) .setOutputCol("ner_chunk") -.setWhiteList(Array("Procedure", "Test", "Test_Result", "Treatment", "Pulse", "Imaging_Technique", "Labour_Delivery", "Blood_Pressure", "Oxygen_Therapy", "Weight", "LDL", 
"O2_Saturation", "BMI", "Vaccine", "Respiration", "Temperature", "Birth_Entity", "Triglycerides", "Puerperium")) +.setWhiteList(Array("Procedure", "Test", "Test_Result", "Treatment", "Pulse", "Imaging_Technique", "Labour_Delivery", "Blood_Pressure", "Oxygen_Therapy", "Weight", "LDL", "O2_Saturation", "BMI", "Vaccine", "Respiration", "Temperature", "Birth_Entity", "Triglycerides")) val chunk2doc = Chunk2Doc().setInputCols("ner_chunk").setOutputCol("ner_chunk_doc") @@ -170,4 +170,4 @@ val result = pipeline.fit(data).transform(data) ## References -Trained on ICD-10 Procedure Coding System dataset with sbiobert_base_cased_mli sentence embeddings. https://www.icd10data.com/ICD10PCS/Codes \ No newline at end of file +Trained on ICD-10 Procedure Coding System dataset with sbiobert_base_cased_mli sentence embeddings. https://www.icd10data.com/ICD10PCS/Codes From 7ad9cda084898fa701cd6b79c0b014df5fa9018d Mon Sep 17 00:00:00 2001 From: Ahmetemintek Date: Sun, 30 Oct 2022 05:34:59 +0700 Subject: [PATCH 31/57] Add model 2022-10-29-icd10cm_mapper_en --- .../2022-10-29-icd10cm_mapper_en.md | 161 ++++++++++++++++++ 1 file changed, 161 insertions(+) create mode 100644 docs/_posts/Ahmetemintek/2022-10-29-icd10cm_mapper_en.md diff --git a/docs/_posts/Ahmetemintek/2022-10-29-icd10cm_mapper_en.md b/docs/_posts/Ahmetemintek/2022-10-29-icd10cm_mapper_en.md new file mode 100644 index 00000000000000..7742bbe8a44104 --- /dev/null +++ b/docs/_posts/Ahmetemintek/2022-10-29-icd10cm_mapper_en.md @@ -0,0 +1,161 @@ +--- +layout: model +title: Mapping Entities with Corresponding ICD-10-CM Codes +author: John Snow Labs +name: icd10cm_mapper +date: 2022-10-29 +tags: [icd10cm, chunk_mapper, clinical, licensed, en] +task: Chunk Mapping +language: en +edition: Spark NLP for Healthcare 4.2.1 +spark_version: 3.0 +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This pretrained model maps entities with their corresponding ICD-10-CM codes. 
+ +## Predicted Entities + +`icd10cm_code` + +{:.btn-box} + +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/26.Chunk_Mapping.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/icd10cm_mapper_en_4.2.1_3.0_1667082016627.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol('text')\ + .setOutputCol('document') + +sentence_detector = SentenceDetector()\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer()\ + .setInputCols("sentence")\ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel\ + .pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token"])\ + .setOutputCol("embeddings") + +ner_model = MedicalNerModel\ + .pretrained("ner_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token", "embeddings"])\ + .setOutputCol("ner") + +ner_converter = NerConverterInternal()\ + .setInputCols("sentence", "token", "ner")\ + .setOutputCol("ner_chunk") + +chunkerMapper = ChunkMapperModel\ + .pretrained("icd10cm_mapper", "en", "clinical/models")\ + .setInputCols(["ner_chunk"])\ + .setOutputCol("mappings")\ + .setRels(["icd10cm_code"]) + +mapper_pipeline = Pipeline().setStages([ + document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner_model, + ner_converter, + chunkerMapper]) + + +test_data = spark.createDataFrame([["A 35-year-old male with a history of primary leiomyosarcoma of neck, gestational diabetes mellitus diagnosed eight years prior to presentation and presented with a one-week history of polydipsia, poor appetite, and vomiting."]]).toDF("text") + +mapper_model = mapper_pipeline.fit(test_data) + +result= mapper_model.transform(test_data) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = new SentenceDetector() + .setInputCols(Array("document")) + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel + .pretrained("embeddings_clinical", "en", "clinical/models") + 
.setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val ner_model = MedicalNerModel + .pretrained("ner_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverterInternal() + .setInputCols("sentence", "token", "ner") + .setOutputCol("ner_chunk") + +val chunkerMapper = ChunkMapperModel + .pretrained("icd10cm_mapper", "en", "clinical/models") + .setInputCols(Array("ner_chunk")) + .setOutputCol("mappings") + .setRels(Array("icd10cm_code")) + +val mapper_pipeline = new Pipeline().setStages(Array( + document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner_model, + ner_converter, + chunkerMapper)) + + +val data = Seq("A 35-year-old male with a history of primary leiomyosarcoma of neck, gestational diabetes mellitus diagnosed eight years prior to presentation and presented with a one-week history of polydipsia, poor appetite, and vomiting.").toDS.toDF("text") + +val result = mapper_pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash ++------------------------------+-------+------------+ +|ner_chunk |entity |icd10cm_code| ++------------------------------+-------+------------+ +|primary leiomyosarcoma of neck|PROBLEM|C49.0 | +|gestational diabetes mellitus |PROBLEM|O24.919 | +|polydipsia |PROBLEM|R63.1 | +|poor appetite |PROBLEM|R63.0 | +|vomiting |PROBLEM|R11.10 | ++------------------------------+-------+------------+ +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|icd10cm_mapper| +|Compatibility:|Spark NLP for Healthcare 4.2.1+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[ner_chunk]| +|Output Labels:|[mappings]| +|Language:|en| +|Size:|6.2 MB| \ No newline at end of file From 4964ecfb3fb73c6aff724091e8023be154d2b38b Mon Sep 17 00:00:00 2001 From: jsl-models <74001263+jsl-models@users.noreply.github.com> Date: Wed, 2 Nov 2022 18:41:49 +0700 Subject: [PATCH 32/57] 2022-10-30-abbreviation_mapper_augmented_en (#13005) * Add model 2022-10-30-abbreviation_mapper_augmented_en * Add model 2022-11-02-icd10cm_resolver_pipeline_en * Delete 2022-11-02-icd10cm_resolver_pipeline_en.md Co-authored-by: Ahmetemintek --- ...-10-30-abbreviation_mapper_augmented_en.md | 163 ++++++++++++++++++ 1 file changed, 163 insertions(+) create mode 100644 docs/_posts/Ahmetemintek/2022-10-30-abbreviation_mapper_augmented_en.md diff --git a/docs/_posts/Ahmetemintek/2022-10-30-abbreviation_mapper_augmented_en.md b/docs/_posts/Ahmetemintek/2022-10-30-abbreviation_mapper_augmented_en.md new file mode 100644 index 00000000000000..0d65ce54e94258 --- /dev/null +++ b/docs/_posts/Ahmetemintek/2022-10-30-abbreviation_mapper_augmented_en.md @@ -0,0 +1,163 @@ +--- +layout: model +title: Mapping Abbreviations and Acronyms of Medical Regulatory Activities with Their Definitions (Augmented) +author: John Snow Labs +name: abbreviation_mapper_augmented +date: 2022-10-30 +tags: [abbreviation, definition, chunk_mapper, clinical, en, licensed] +task: Chunk Mapping 
+language: en +edition: Spark NLP for Healthcare 4.2.1 +spark_version: 3.0 +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This pretrained model maps abbreviations and acronyms of medical regulatory activities with their `definition`. This is an augmented version of the `abbreviation_mapper` model with new abbreviations. + +## Predicted Entities + +`definition` + +{:.btn-box} + +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/26.Chunk_Mapping.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/abbreviation_mapper_augmented_en_4.2.1_3.0_1667127908106.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_detector = SentenceDetector()\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer()\ + .setInputCols("sentence")\ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token"])\ + .setOutputCol("embeddings") + +abbr_ner = MedicalNerModel.pretrained("ner_abbreviation_clinical", "en", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("abbr_ner") + +abbr_converter = NerConverter() \ + .setInputCols(["sentence", "token", "abbr_ner"]) \ + .setOutputCol("abbr_ner_chunk")\ + +chunkerMapper = ChunkMapperModel.pretrained("abbreviation_mapper_augmented", "en", "clinical/models")\ + .setInputCols(["abbr_ner_chunk"])\ + .setOutputCol("mappings")\ + .setRels(["definition"])\ + +pipeline = Pipeline().setStages([ + document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + abbr_ner, + abbr_converter, + chunkerMapper]) + + +text = ["""Gravid with estimated fetal weight of 6-6/12 pounds. + LABORATORY DATA: Laboratory tests include a CBC which is normal. + VDRL: Nonreactive + HIV: Negative. One-Hour Glucose: 117. 
Group B strep has not been done as yet."""] + +data = spark.createDataFrame([text]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = new SentenceDetector() + .setInputCols(Array("document")) + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val abbr_ner = MedicalNerModel.pretrained("ner_abbreviation_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("abbr_ner") + +val abbr_converter = new NerConverter() + .setInputCols(Array("sentence", "token", "abbr_ner")) + .setOutputCol("abbr_ner_chunk") + +val chunkerMapper = ChunkMapperModel.pretrained("abbreviation_mapper_augmented", "en", "clinical/models") + .setInputCols("abbr_ner_chunk") + .setOutputCol("mappings") + .setRels(Array("definition")) + + +val pipeline = new Pipeline().setStages(Array( + document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + abbr_ner, + abbr_converter, + chunkerMapper)) + + +val sample_text = """Gravid with estimated fetal weight of 6-6/12 pounds. + LABORATORY DATA: Laboratory tests include a CBC which is normal. + VDRL: Nonreactive + HIV: Negative. One-Hour Glucose: 117. Group B strep has not been done as yet.""" + + +val data = Seq(sample_text).toDS.toDF("text") + +val result= pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash ++---------+--------------------------------------+ +|ner_chunk|abbreviation | ++---------+--------------------------------------+ +|CBC |complete blood count | +|VDRL |Venereal Disease Research Laboratories| +|HIV |human immunodeficiency virus | ++---------+--------------------------------------+ +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|abbreviation_mapper_augmented| +|Compatibility:|Spark NLP for Healthcare 4.2.1+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[abbr_ner_chunk]| +|Output Labels:|[mappings]| +|Language:|en| +|Size:|319.6 KB| \ No newline at end of file From d45a541b5865366949e2784f525694dd5d413089 Mon Sep 17 00:00:00 2001 From: jsl-models <74001263+jsl-models@users.noreply.github.com> Date: Wed, 2 Nov 2022 19:32:53 +0700 Subject: [PATCH 33/57] Add model 2022-11-02-icd10cm_resolver_pipeline_en (#13017) Co-authored-by: Ahmetemintek --- ...2022-11-02-icd10cm_resolver_pipeline_en.md | 90 +++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 docs/_posts/Ahmetemintek/2022-11-02-icd10cm_resolver_pipeline_en.md diff --git a/docs/_posts/Ahmetemintek/2022-11-02-icd10cm_resolver_pipeline_en.md b/docs/_posts/Ahmetemintek/2022-11-02-icd10cm_resolver_pipeline_en.md new file mode 100644 index 00000000000000..ec39fb5e3eb570 --- /dev/null +++ b/docs/_posts/Ahmetemintek/2022-11-02-icd10cm_resolver_pipeline_en.md @@ -0,0 +1,90 @@ +--- +layout: model +title: Pipeline to Resolve ICD-10-CM Codes +author: John Snow Labs +name: icd10cm_resolver_pipeline +date: 2022-11-02 +tags: [en, clinical, licensed, resolver, chunk_mapping, pipeline, icd10cm] +task: Pipeline Healthcare +language: en +edition: Spark NLP for Healthcare 4.2.1 +spark_version: 3.0 +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This pretrained pipeline maps entities with their corresponding ICD-10-CM codes. 
You’ll just feed your text and it will return the corresponding ICD-10-CM codes. + +{:.btn-box} + +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/3.Clinical_Entity_Resolvers.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/icd10cm_resolver_pipeline_en_4.2.1_3.0_1667389014041.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +from sparknlp.pretrained import PretrainedPipeline + +resolver_pipeline = PretrainedPipeline("icd10cm_resolver_pipeline", "en", "clinical/models") + +text = """A 28-year-old female with a history of gestational diabetes mellitus diagnosed eight years and anisakiasis. Also, it was reported that fetal and neonatal hemorrhage""" + +result = resolver_pipeline.fullAnnotate(text) +``` +```scala +import com.johnsnowlabs.nlp.pretrained.PretrainedPipeline + +val resolver_pipeline = new PretrainedPipeline("icd10cm_resolver_pipeline", "en", "clinical/models") + +val result = resolver_pipeline.fullAnnotate("""A 28-year-old female with a history of gestational diabetes mellitus diagnosed eight years and anisakiasis. Also, it was reported that fetal and neonatal hemorrhage""") +``` +
+ +## Results + +```bash ++-----------------------------+---------+------------+ +|chunk |ner_chunk|icd10cm_code| ++-----------------------------+---------+------------+ +|gestational diabetes mellitus|PROBLEM |O24.919 | +|anisakiasis |PROBLEM |B81.0 | +|fetal and neonatal hemorrhage|PROBLEM |P545 | ++-----------------------------+---------+------------+ +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|icd10cm_resolver_pipeline| +|Type:|pipeline| +|Compatibility:|Spark NLP for Healthcare 4.2.1+| +|License:|Licensed| +|Edition:|Official| +|Language:|en| +|Size:|3.5 GB| + +## Included Models + +- DocumentAssembler +- SentenceDetector +- TokenizerModel +- WordEmbeddingsModel +- MedicalNerModel +- NerConverter +- ChunkMapperModel +- ChunkMapperModel +- ChunkMapperFilterer +- Chunk2Doc +- BertSentenceEmbeddings +- SentenceEntityResolverModel +- ResolverMerger \ No newline at end of file From 452d8d395297b827053608f942989416d17eb59a Mon Sep 17 00:00:00 2001 From: jsl-models <74001263+jsl-models@users.noreply.github.com> Date: Fri, 4 Nov 2022 00:45:02 +0700 Subject: [PATCH 34/57] Add model 2022-11-03-oncology_general_pipeline_en (#13031) Co-authored-by: mauro-nievoff --- ...2022-11-03-oncology_general_pipeline_en.md | 171 ++++++++++++++++++ 1 file changed, 171 insertions(+) create mode 100644 docs/_posts/mauro-nievoff/2022-11-03-oncology_general_pipeline_en.md diff --git a/docs/_posts/mauro-nievoff/2022-11-03-oncology_general_pipeline_en.md b/docs/_posts/mauro-nievoff/2022-11-03-oncology_general_pipeline_en.md new file mode 100644 index 00000000000000..1e828496a01991 --- /dev/null +++ b/docs/_posts/mauro-nievoff/2022-11-03-oncology_general_pipeline_en.md @@ -0,0 +1,171 @@ +--- +layout: model +title: General Oncology Pipeline +author: John Snow Labs +name: oncology_general_pipeline +date: 2022-11-03 +tags: [licensed, oncology, clinical, en] +task: Named Entity Recognition +language: en +edition: Spark NLP for Healthcare 4.1.0 
+spark_version: 3.0 +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This pipeline includes Named-Entity Recognition, Assertion Status and Relation Extraction models to extract information from oncology texts. This pipeline extracts diagnoses, treatments, tests, anatomical references and demographic entities. + +{:.btn-box} + +[Open in Colab](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/27.Oncology_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/oncology_general_pipeline_en_4.1.0_3.0_1667489644241.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +from sparknlp.pretrained import PretrainedPipeline + +pipeline = PretrainedPipeline("oncology_general_pipeline", "en", "clinical/models") + +pipeline.fullAnnotate("""The patient underwent a left mastectomy for a left breast cancer two months ago. +The tumor is positive for ER and PR.""")[0] +``` +```scala +import com.johnsnowlabs.nlp.pretrained.PretrainedPipeline + +val pipeline = new PretrainedPipeline("oncology_general_pipeline", "en", "clinical/models") + +val result = pipeline.fullAnnotate("""The patient underwent a left mastectomy for a left breast cancer two months ago. +The tumor is positive for ER and PR.""")(0) +``` +
+ +## Results + +```bash +******************** ner_oncology_wip results ******************** + +| chunk | ner_label | +|:---------------|:-----------------| +| left | Direction | +| mastectomy | Cancer_Surgery | +| left | Direction | +| breast cancer | Cancer_Dx | +| two months ago | Relative_Date | +| tumor | Tumor_Finding | +| positive | Biomarker_Result | +| ER | Biomarker | +| PR | Biomarker | + + +******************** ner_oncology_diagnosis_wip results ******************** + +| chunk | ner_label | +|:--------------|:--------------| +| breast cancer | Cancer_Dx | +| tumor | Tumor_Finding | + + +******************** ner_oncology_tnm_wip results ******************** + +| chunk | ner_label | +|:--------------|:------------| +| breast cancer | Cancer_Dx | +| tumor | Tumor | + + +******************** ner_oncology_therapy_wip results ******************** + +| chunk | ner_label | +|:-----------|:---------------| +| mastectomy | Cancer_Surgery | + + +******************** ner_oncology_test_wip results ******************** + +| chunk | ner_label | +|:---------|:-----------------| +| positive | Biomarker_Result | +| ER | Biomarker | +| PR | Biomarker | + + +******************** assertion_oncology_wip results ******************** + +| chunk | ner_label | assertion | +|:--------------|:---------------|:------------| +| mastectomy | Cancer_Surgery | Past | +| breast cancer | Cancer_Dx | Present | +| tumor | Tumor_Finding | Present | +| ER | Biomarker | Present | +| PR | Biomarker | Present | + + +******************** re_oncology_wip results ******************** + +| chunk1 | entity1 | chunk2 | entity2 | relation | +|:--------------|:-----------------|:---------------|:--------------|:--------------| +| mastectomy | Cancer_Surgery | two months ago | Relative_Date | is_related_to | +| breast cancer | Cancer_Dx | two months ago | Relative_Date | is_related_to | +| tumor | Tumor_Finding | ER | Biomarker | O | +| tumor | Tumor_Finding | PR | Biomarker | O | +| positive | 
Biomarker_Result | ER | Biomarker | is_related_to | +| positive | Biomarker_Result | PR | Biomarker | is_related_to | + + +******************** re_oncology_granular_wip results ******************** + +| chunk1 | entity1 | chunk2 | entity2 | relation | +|:--------------|:-----------------|:---------------|:--------------|:--------------| +| mastectomy | Cancer_Surgery | two months ago | Relative_Date | is_date_of | +| breast cancer | Cancer_Dx | two months ago | Relative_Date | is_date_of | +| tumor | Tumor_Finding | ER | Biomarker | O | +| tumor | Tumor_Finding | PR | Biomarker | O | +| positive | Biomarker_Result | ER | Biomarker | is_finding_of | +| positive | Biomarker_Result | PR | Biomarker | is_finding_of | +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|oncology_general_pipeline| +|Type:|pipeline| +|Compatibility:|Spark NLP for Healthcare 4.1.0+| +|License:|Licensed| +|Edition:|Official| +|Language:|en| +|Size:|1.7 GB| + +## Included Models + +- DocumentAssembler +- SentenceDetectorDLModel +- TokenizerModel +- WordEmbeddingsModel +- MedicalNerModel +- NerConverter +- MedicalNerModel +- NerConverter +- MedicalNerModel +- NerConverter +- MedicalNerModel +- NerConverter +- MedicalNerModel +- NerConverter +- ChunkMergeModel +- ChunkMergeModel +- AssertionDLModel +- PerceptronModel +- DependencyParserModel +- RelationExtractionModel +- RelationExtractionModel \ No newline at end of file From 9a009b7b0b5b6f17fd56f1eb295738c765247c98 Mon Sep 17 00:00:00 2001 From: jsl-models <74001263+jsl-models@users.noreply.github.com> Date: Thu, 10 Nov 2022 20:07:02 +0700 Subject: [PATCH 35/57] 2022-11-04-oncology_diagnosis_pipeline_en (#13038) * Add model 2022-11-04-oncology_diagnosis_pipeline_en * Add model 2022-11-04-oncology_biomarker_pipeline_en * Add model 2022-11-04-oncology_therapy_pipeline_en * Update 2022-11-04-oncology_diagnosis_pipeline_en.md * Update 2022-11-04-oncology_diagnosis_pipeline_en.md * Update 
2022-11-04-oncology_diagnosis_pipeline_en.md Co-authored-by: mauro-nievoff Co-authored-by: mauro-nievoff <55700369+mauro-nievoff@users.noreply.github.com> Co-authored-by: muhammetsnts <76607915+muhammetsnts@users.noreply.github.com> --- ...22-11-04-oncology_biomarker_pipeline_en.md | 216 ++++++++++++++++++ ...22-11-04-oncology_diagnosis_pipeline_en.md | 187 +++++++++++++++ ...2022-11-04-oncology_therapy_pipeline_en.md | 140 ++++++++++++ 3 files changed, 543 insertions(+) create mode 100644 docs/_posts/mauro-nievoff/2022-11-04-oncology_biomarker_pipeline_en.md create mode 100644 docs/_posts/mauro-nievoff/2022-11-04-oncology_diagnosis_pipeline_en.md create mode 100644 docs/_posts/mauro-nievoff/2022-11-04-oncology_therapy_pipeline_en.md diff --git a/docs/_posts/mauro-nievoff/2022-11-04-oncology_biomarker_pipeline_en.md b/docs/_posts/mauro-nievoff/2022-11-04-oncology_biomarker_pipeline_en.md new file mode 100644 index 00000000000000..c01942b90b3839 --- /dev/null +++ b/docs/_posts/mauro-nievoff/2022-11-04-oncology_biomarker_pipeline_en.md @@ -0,0 +1,216 @@ +--- +layout: model +title: Oncology Pipeline for Biomarkers +author: John Snow Labs +name: oncology_biomarker_pipeline +date: 2022-11-04 +tags: [licensed, en, oncology] +task: Named Entity Recognition +language: en +edition: Spark NLP for Healthcare 4.2.2 +spark_version: 3.0 +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This pipeline includes Named-Entity Recognition, Assertion Status and Relation Extraction models to extract information from oncology texts. This pipeline focuses on entities related to biomarkers. 
+ +{:.btn-box} + +[Open in Colab](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/27.Oncology_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/oncology_biomarker_pipeline_en_4.2.2_3.0_1667581643291.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +from sparknlp.pretrained import PretrainedPipeline + +pipeline = PretrainedPipeline("oncology_biomarker_pipeline", "en", "clinical/models") + +pipeline.fullAnnotate("Immunohistochemistry was negative for thyroid transcription factor-1 and napsin A. The test was positive for ER and PR, and negative for HER2.")[0] +``` +```scala +import com.johnsnowlabs.nlp.pretrained.PretrainedPipeline + +val pipeline = new PretrainedPipeline("oncology_biomarker_pipeline", "en", "clinical/models") + +val result = pipeline.fullAnnotate("""Immunohistochemistry was negative for thyroid transcription factor-1 and napsin A. The test was positive for ER and PR, and negative for HER2.""")(0) + +``` +
+ +## Results + +```bash +******************** ner_oncology_wip results ******************** + +| chunk | ner_label | +|:-------------------------------|:-----------------| +| negative | Biomarker_Result | +| thyroid transcription factor-1 | Biomarker | +| napsin | Biomarker | +| positive | Biomarker_Result | +| ER | Biomarker | +| PR | Biomarker | +| negative | Biomarker_Result | +| HER2 | Oncogene | + + +******************** ner_oncology_biomarker_wip results ******************** + +| chunk | ner_label | +|:-------------------------------|:-----------------| +| negative | Biomarker_Result | +| thyroid transcription factor-1 | Biomarker | +| napsin A | Biomarker | +| positive | Biomarker_Result | +| ER | Biomarker | +| PR | Biomarker | +| negative | Biomarker_Result | +| HER2 | Biomarker | + + +******************** ner_oncology_test_wip results ******************** + +| chunk | ner_label | +|:-------------------------------|:-----------------| +| Immunohistochemistry | Pathology_Test | +| negative | Biomarker_Result | +| thyroid transcription factor-1 | Biomarker | +| napsin A | Biomarker | +| positive | Biomarker_Result | +| ER | Biomarker | +| PR | Biomarker | +| negative | Biomarker_Result | +| HER2 | Oncogene | + + +******************** ner_biomarker results ******************** + +| chunk | ner_label | +|:-------------------------------|:----------------------| +| Immunohistochemistry | Test | +| negative | Biomarker_Measurement | +| thyroid transcription factor-1 | Biomarker | +| napsin A | Biomarker | +| positive | Biomarker_Measurement | +| ER | Biomarker | +| PR | Biomarker | +| negative | Biomarker_Measurement | +| HER2 | Biomarker | + + +******************** assertion_oncology_wip results ******************** + +| chunk | ner_label | assertion | +|:-------------------------------|:---------------|:------------| +| Immunohistochemistry | Pathology_Test | Past | +| thyroid transcription factor-1 | Biomarker | Present | +| napsin A | Biomarker | Present | 
+| ER | Biomarker | Present | +| PR | Biomarker | Present | +| HER2 | Oncogene | Present | + + +******************** assertion_oncology_test_binary_wip results ******************** + +| chunk | ner_label | assertion | +|:-------------------------------|:---------------|:----------------| +| Immunohistochemistry | Pathology_Test | Medical_History | +| thyroid transcription factor-1 | Biomarker | Medical_History | +| napsin A | Biomarker | Medical_History | +| ER | Biomarker | Medical_History | +| PR | Biomarker | Medical_History | +| HER2 | Oncogene | Medical_History | + + +******************** re_oncology_wip results ******************** + +| chunk1 | entity1 | chunk2 | entity2 | relation | +|:---------------------|:-----------------|:-------------------------------|:-----------------|:--------------| +| Immunohistochemistry | Pathology_Test | negative | Biomarker_Result | O | +| negative | Biomarker_Result | thyroid transcription factor-1 | Biomarker | is_related_to | +| negative | Biomarker_Result | napsin A | Biomarker | is_related_to | +| positive | Biomarker_Result | ER | Biomarker | is_related_to | +| positive | Biomarker_Result | PR | Biomarker | is_related_to | +| positive | Biomarker_Result | HER2 | Oncogene | O | +| ER | Biomarker | negative | Biomarker_Result | O | +| PR | Biomarker | negative | Biomarker_Result | O | +| negative | Biomarker_Result | HER2 | Oncogene | is_related_to | + + +******************** re_oncology_granular_wip results ******************** + +| chunk1 | entity1 | chunk2 | entity2 | relation | +|:---------------------|:-----------------|:-------------------------------|:-----------------|:--------------| +| Immunohistochemistry | Pathology_Test | negative | Biomarker_Result | O | +| negative | Biomarker_Result | thyroid transcription factor-1 | Biomarker | is_finding_of | +| negative | Biomarker_Result | napsin A | Biomarker | is_finding_of | +| positive | Biomarker_Result | ER | Biomarker | is_finding_of | +| positive | 
Biomarker_Result | PR | Biomarker | is_finding_of | +| positive | Biomarker_Result | HER2 | Oncogene | is_finding_of | +| ER | Biomarker | negative | Biomarker_Result | O | +| PR | Biomarker | negative | Biomarker_Result | O | +| negative | Biomarker_Result | HER2 | Oncogene | is_finding_of | + + +******************** re_oncology_biomarker_result_wip results ******************** + +| chunk1 | entity1 | chunk2 | entity2 | relation | +|:---------------------|:-----------------|:-------------------------------|:-----------------|:--------------| +| Immunohistochemistry | Pathology_Test | negative | Biomarker_Result | is_finding_of | +| negative | Biomarker_Result | thyroid transcription factor-1 | Biomarker | is_finding_of | +| negative | Biomarker_Result | napsin A | Biomarker | is_finding_of | +| positive | Biomarker_Result | ER | Biomarker | is_finding_of | +| positive | Biomarker_Result | PR | Biomarker | is_finding_of | +| positive | Biomarker_Result | HER2 | Oncogene | O | +| ER | Biomarker | negative | Biomarker_Result | O | +| PR | Biomarker | negative | Biomarker_Result | O | +| negative | Biomarker_Result | HER2 | Oncogene | is_finding_of | + +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|oncology_biomarker_pipeline| +|Type:|pipeline| +|Compatibility:|Spark NLP for Healthcare 4.2.2+| +|License:|Licensed| +|Edition:|Official| +|Language:|en| +|Size:|1.7 GB| + +## Included Models + +- DocumentAssembler +- SentenceDetectorDLModel +- TokenizerModel +- WordEmbeddingsModel +- MedicalNerModel +- NerConverter +- MedicalNerModel +- NerConverter +- MedicalNerModel +- NerConverter +- MedicalNerModel +- NerConverter +- ChunkMergeModel +- ChunkMergeModel +- AssertionDLModel +- AssertionDLModel +- PerceptronModel +- DependencyParserModel +- RelationExtractionModel +- RelationExtractionModel +- RelationExtractionModel \ No newline at end of file diff --git a/docs/_posts/mauro-nievoff/2022-11-04-oncology_diagnosis_pipeline_en.md 
b/docs/_posts/mauro-nievoff/2022-11-04-oncology_diagnosis_pipeline_en.md new file mode 100644 index 00000000000000..4d82872e76a195 --- /dev/null +++ b/docs/_posts/mauro-nievoff/2022-11-04-oncology_diagnosis_pipeline_en.md @@ -0,0 +1,187 @@ +--- +layout: model +title: Oncology Pipeline for Diagnosis Entities +author: John Snow Labs +name: oncology_diagnosis_pipeline +date: 2022-11-04 +tags: [licensed, en, oncology, clinical] +task: Named Entity Recognition +language: en +edition: Spark NLP for Healthcare 4.2.2 +spark_version: 3.0 +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This pipeline includes Named-Entity Recognition, Assertion Status, Relation Extraction and Entity Resolution models to extract information from oncology texts. This pipeline focuses on entities related to oncological diagnosis. + +{:.btn-box} + +[Open in Colab](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/27.Oncology_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/oncology_diagnosis_pipeline_en_4.2.2_3.0_1667569522240.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +from sparknlp.pretrained import PretrainedPipeline + +pipeline = PretrainedPipeline("oncology_diagnosis_pipeline", "en", "clinical/models") + +pipeline.fullAnnotate("Two years ago, the patient presented with a 4-cm tumor in her left breast. She was diagnosed with ductal carcinoma. According to her last CT, she has no lung metastases.")[0] +``` +```scala +import com.johnsnowlabs.nlp.pretrained.PretrainedPipeline + +val pipeline = new PretrainedPipeline("oncology_diagnosis_pipeline", "en", "clinical/models") + +val result = pipeline.fullAnnotate("""Two years ago, the patient presented with a 4-cm tumor in her left breast. She was diagnosed with ductal carcinoma. According to her last CT, she has no lung metastases.""")(0) +``` +
+ +## Results + +```bash +******************** ner_oncology_wip results ******************** + +| chunk | ner_label | +|:-----------|:------------------| +| 4-cm | Tumor_Size | +| tumor | Tumor_Finding | +| left | Direction | +| breast | Site_Breast | +| ductal | Histological_Type | +| carcinoma | Cancer_Dx | +| lung | Site_Lung | +| metastases | Metastasis | + + +******************** ner_oncology_diagnosis_wip results ******************** + +| chunk | ner_label | +|:-----------|:------------------| +| 4-cm | Tumor_Size | +| tumor | Tumor_Finding | +| ductal | Histological_Type | +| carcinoma | Cancer_Dx | +| metastases | Metastasis | + + +******************** ner_oncology_tnm_wip results ******************** + +| chunk | ner_label | +|:-----------|:------------------| +| 4-cm | Tumor_Description | +| tumor | Tumor | +| ductal | Tumor_Description | +| carcinoma | Cancer_Dx | +| metastases | Metastasis | + + +******************** assertion_oncology_wip results ******************** + +| chunk | ner_label | assertion | +|:-----------|:------------------|:------------| +| tumor | Tumor_Finding | Present | +| ductal | Histological_Type | Present | +| carcinoma | Cancer_Dx | Present | +| metastases | Metastasis | Absent | + + +******************** assertion_oncology_problem_wip results ******************** + +| chunk | ner_label | assertion | +|:-----------|:------------------|:-----------------------| +| tumor | Tumor_Finding | Medical_History | +| ductal | Histological_Type | Medical_History | +| carcinoma | Cancer_Dx | Medical_History | +| metastases | Metastasis | Hypothetical_Or_Absent | + + +******************** re_oncology_wip results ******************** + +| chunk1 | entity1 | chunk2 | entity2 | relation | +|:---------|:--------------|:-----------|:--------------|:--------------| +| 4-cm | Tumor_Size | tumor | Tumor_Finding | is_related_to | +| 4-cm | Tumor_Size | carcinoma | Cancer_Dx | O | +| tumor | Tumor_Finding | breast | Site_Breast | is_related_to | +| 
breast | Site_Breast | carcinoma | Cancer_Dx | O | +| lung | Site_Lung | metastases | Metastasis | is_related_to | + + +******************** re_oncology_granular_wip results ******************** + +| chunk1 | entity1 | chunk2 | entity2 | relation | +|:---------|:--------------|:-----------|:--------------|:---------------| +| 4-cm | Tumor_Size | tumor | Tumor_Finding | is_size_of | +| 4-cm | Tumor_Size | carcinoma | Cancer_Dx | O | +| tumor | Tumor_Finding | breast | Site_Breast | is_location_of | +| breast | Site_Breast | carcinoma | Cancer_Dx | O | +| lung | Site_Lung | metastases | Metastasis | is_location_of | + + +******************** re_oncology_size_wip results ******************** + +| chunk1 | entity1 | chunk2 | entity2 | relation | +|:---------|:-----------|:----------|:--------------|:-----------| +| 4-cm | Tumor_Size | tumor | Tumor_Finding | is_size_of | +| 4-cm | Tumor_Size | carcinoma | Cancer_Dx | O | + + +******************** ICD-O resolver results ******************** + +| chunk | ner_label | code | normalized_term | +|:-----------|:------------------|:-------|:------------------| +| tumor | Tumor_Finding | 8000/1 | tumor | +| breast | Site_Breast | C50 | breast | +| ductal | Histological_Type | 8500/2 | dcis | +| carcinoma | Cancer_Dx | 8010/3 | carcinoma | +| lung | Site_Lung | C34.9 | lung | +| metastases | Metastasis | 8000/6 | tumor, metastatic | +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|oncology_diagnosis_pipeline| +|Type:|pipeline| +|Compatibility:|Spark NLP for Healthcare 4.2.2+| +|License:|Licensed| +|Edition:|Official| +|Language:|en| +|Size:|2.3 GB| + +## Included Models + +- DocumentAssembler +- SentenceDetectorDLModel +- TokenizerModel +- WordEmbeddingsModel +- MedicalNerModel +- NerConverter +- MedicalNerModel +- NerConverter +- MedicalNerModel +- NerConverter +- ChunkMergeModel +- ChunkMergeModel +- AssertionDLModel +- AssertionDLModel +- PerceptronModel +- DependencyParserModel +- 
RelationExtractionModel +- RelationExtractionModel +- RelationExtractionModel +- ChunkMergeModel +- Chunk2Doc +- BertSentenceEmbeddings +- SentenceEntityResolverModel diff --git a/docs/_posts/mauro-nievoff/2022-11-04-oncology_therapy_pipeline_en.md b/docs/_posts/mauro-nievoff/2022-11-04-oncology_therapy_pipeline_en.md new file mode 100644 index 00000000000000..4e0859c800f4dc --- /dev/null +++ b/docs/_posts/mauro-nievoff/2022-11-04-oncology_therapy_pipeline_en.md @@ -0,0 +1,140 @@ +--- +layout: model +title: Oncology Pipeline for Therapies +author: John Snow Labs +name: oncology_therapy_pipeline +date: 2022-11-04 +tags: [licensed, clinical, en] +task: Named Entity Recognition +language: en +edition: Spark NLP for Healthcare 4.2.2 +spark_version: 3.0 +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This pipeline includes Named-Entity Recognition and Assertion Status models to extract information from oncology texts. This pipeline focuses on entities related to therapies. + +{:.btn-box} + +[Open in Colab](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/27.Oncology_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/oncology_therapy_pipeline_en_4.2.2_3.0_1667593592479.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +from sparknlp.pretrained import PretrainedPipeline + +pipeline = PretrainedPipeline("oncology_therapy_pipeline", "en", "clinical/models") + +pipeline.fullAnnotate("The patient underwent a mastectomy two years ago. She is currently receiving her second cycle of adriamycin and cyclophosphamide, and is in good overall condition.")[0] +``` +```scala +import com.johnsnowlabs.nlp.pretrained.PretrainedPipeline + +val pipeline = new PretrainedPipeline("oncology_therapy_pipeline", "en", "clinical/models") + +val result = pipeline.fullAnnotate("""The patient underwent a mastectomy two years ago. She is currently receiving her second cycle of adriamycin and cyclophosphamide, and is in good overall condition.""")(0) +``` +
+ +## Results + +```bash +******************** ner_oncology_wip results ******************** + +| chunk | ner_label | +|:-----------------|:---------------| +| mastectomy | Cancer_Surgery | +| second cycle | Cycle_Number | +| adriamycin | Chemotherapy | +| cyclophosphamide | Chemotherapy | + + +******************** ner_oncology_wip results ******************** + +| chunk | ner_label | +|:-----------------|:---------------| +| mastectomy | Cancer_Surgery | +| second cycle | Cycle_Number | +| adriamycin | Chemotherapy | +| cyclophosphamide | Chemotherapy | + + +******************** ner_oncology_wip results ******************** + +| chunk | ner_label | +|:-----------------|:---------------| +| mastectomy | Cancer_Surgery | +| second cycle | Cycle_Number | +| adriamycin | Cancer_Therapy | +| cyclophosphamide | Cancer_Therapy | + + +******************** ner_oncology_unspecific_posology_wip results ******************** + +| chunk | ner_label | +|:-----------------|:---------------------| +| mastectomy | Cancer_Therapy | +| second cycle | Posology_Information | +| adriamycin | Cancer_Therapy | +| cyclophosphamide | Cancer_Therapy | + + +******************** assertion_oncology_wip results ******************** + +| chunk | ner_label | assertion | +|:-----------------|:---------------|:------------| +| mastectomy | Cancer_Surgery | Past | +| adriamycin | Chemotherapy | Present | +| cyclophosphamide | Chemotherapy | Present | + + +******************** assertion_oncology_treatment_binary_wip results ******************** + +| chunk | ner_label | assertion | +|:-----------------|:---------------|:----------------| +| mastectomy | Cancer_Surgery | Present_Or_Past | +| adriamycin | Chemotherapy | Present_Or_Past | +| cyclophosphamide | Chemotherapy | Present_Or_Past | +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|oncology_therapy_pipeline| +|Type:|pipeline| +|Compatibility:|Spark NLP for Healthcare 4.2.2+| +|License:|Licensed| 
+|Edition:|Official| +|Language:|en| +|Size:|1.7 GB| + +## Included Models + +- DocumentAssembler +- SentenceDetectorDLModel +- TokenizerModel +- WordEmbeddingsModel +- MedicalNerModel +- NerConverter +- MedicalNerModel +- NerConverter +- MedicalNerModel +- NerConverter +- MedicalNerModel +- NerConverter +- ChunkMergeModel +- ChunkMergeModel +- AssertionDLModel +- AssertionDLModel \ No newline at end of file From ddd470a01412cf577a114ce61e713379285e250d Mon Sep 17 00:00:00 2001 From: jsl-models <74001263+jsl-models@users.noreply.github.com> Date: Tue, 15 Nov 2022 23:09:46 +0700 Subject: [PATCH 36/57] 2022-11-15-ner_sdoh_slim_wip_en (#13088) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add model 2022-11-15-ner_sdoh_slim_wip_en * Update 2022-11-15-ner_sdoh_slim_wip_en.md * Update 2022-11-15-ner_sdoh_slim_wip_en.md * Update 2022-11-15-ner_sdoh_slim_wip_en.md Co-authored-by: gokhanturer Co-authored-by: Gökhan <81560784+gokhanturer@users.noreply.github.com> Co-authored-by: Cabir C <64752006+Cabir40@users.noreply.github.com> --- .../2022-11-15-ner_sdoh_slim_wip_en.md | 259 ++++++++++++++++++ 1 file changed, 259 insertions(+) create mode 100644 docs/_posts/gokhanturer/2022-11-15-ner_sdoh_slim_wip_en.md diff --git a/docs/_posts/gokhanturer/2022-11-15-ner_sdoh_slim_wip_en.md b/docs/_posts/gokhanturer/2022-11-15-ner_sdoh_slim_wip_en.md new file mode 100644 index 00000000000000..8a4e6047753ec9 --- /dev/null +++ b/docs/_posts/gokhanturer/2022-11-15-ner_sdoh_slim_wip_en.md @@ -0,0 +1,259 @@ +--- +layout: model +title: Social Determinants of Health (slim) +author: John Snow Labs +name: ner_sdoh_slim_wip +date: 2022-11-15 +tags: [en, licensed, sdoh, social_determinants, public_health, clinical] +task: Named Entity Recognition +language: en +edition: Healthcare NLP 4.2.1 +spark_version: 3.0 +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model extracts 
terminology related to `Social Determinants of Health` from various kinds of biomedical documents. + +## Predicted Entities + +`Housing`, `Smoking`, `Substance_Frequency`, `Childhood_Development`, `Age`, `Other_Disease`, `Employment`, `Marital_Status`, `Diet`, `Disability`, `Mental_Health`, `Alcohol`, `Substance_Quantity`, `Family_Member`, `Race_Ethnicity`, `Gender`, `Geographic_Entity`, `Sexual_Orientation`, `Substance_Use` + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_sdoh_slim_wip_en_4.2.1_3.0_1668524622964.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +documentAssembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentenceDetector = SentenceDetectorDLModel.pretrained("sentence_detector_dl","xx")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer()\ + .setInputCols(["sentence"])\ + .setOutputCol("token") + +clinical_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token"])\ + .setOutputCol("embeddings") + +ner_model = MedicalNerModel.pretrained("ner_sdoh_slim_wip", "en", "clinical/models")\ + .setInputCols(["sentence", "token", "embeddings"])\ + .setOutputCol("ner") + +ner_converter = NerConverter()\ + .setInputCols(["sentence","token","ner"])\ + .setOutputCol("ner_chunk") + +nlpPipeline = Pipeline(stages=[ + documentAssembler, + sentenceDetector, + tokenizer, + clinical_embeddings, + ner_model, + ner_converter]) + +text = [""" Mother states that he does smoke, there is a family hx of alcohol on both maternal and paternal sides of the family, maternal grandfather who died of alcohol related complications and paternal grandmother with severe alcoholism. Pts own drinking began at age 16, living in LA, had a DUI at age 17 after totaling a new car that his mother bought for him, he was married. 
"""] + +data = spark.createDataFrame([text]).toDF("text") + +result = nlpPipeline.fit(data).transform(data) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx") + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val clinical_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val ner_model = MedicalNerModel.pretrained("ner_sdoh_slim_wip", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverter() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + +val nlpPipeline = new Pipeline().setStages(Array(document_assembler, + sentence_detector, + tokenizer, + clinical_embeddings, + ner_model, + ner_converter)) + +val data = Seq("""Mother states that he does smoke, there is a family hx of alcohol on both maternal and paternal sides of the family, maternal grandfather who died of alcohol related complications and paternal grandmother with severe alcoholism. Pts own drinking began at age 16, living in LA, had a DUI at age 17 after totaling a new car that his mother bought for him, he was married.""").toDS.toDF("text") + +val result = nlpPipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash ++-------------+-------------------+ +| token| ner_label| ++-------------+-------------------+ +| Mother| B-Family_Member| +| states| O| +| that| O| +| he| B-Gender| +| does| O| +| smoke| B-Smoking| +| ,| O| +| there| O| +| is| O| +| a| O| +| family| O| +| hx| O| +| of| O| +| alcohol| B-Alcohol| +| on| O| +| both| O| +| maternal| B-Family_Member| +| and| O| +| paternal| B-Family_Member| +| sides| O| +| of| O| +| the| O| +| family| O| +| ,| O| +| maternal| B-Family_Member| +| grandfather| B-Family_Member| +| who| O| +| died| O| +| of| O| +| alcohol| B-Alcohol| +| related| O| +|complications| O| +| and| O| +| paternal| B-Family_Member| +| grandmother| B-Family_Member| +| with| O| +| severe| B-Alcohol| +| alcoholism| I-Alcohol| +| .| O| +| Pts| O| +| own| O| +| drinking| B-Alcohol| +| began| O| +| at| O| +| age| B-Age| +| 16| I-Age| +| ,| O| +| living| O| +| in| O| +| LA|B-Geographic_Entity| +| ,| O| +| had| O| +| a| O| +| DUI| O| +| at| O| +| age| O| +| 17| O| +| after| O| +| totaling| O| +| a| O| +| new| O| +| car| O| +| that| O| +| his| B-Gender| +| mother| B-Family_Member| +| bought| O| +| for| O| +| him| B-Gender| +| ,| O| +| he| B-Gender| +| was| O| +| married| B-Marital_Status| +| .| O| ++-------------+-------------------+ +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_sdoh_slim_wip| +|Compatibility:|Healthcare NLP 4.2.1+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|en| +|Size:|2.8 MB| + +## References + +Manual annotations have been made over [MTSamples](https://mtsamples.com/) and [MIMIC](https://physionet.org/content/mimiciii/1.4/) datasets. 
+ +## Benchmarking + +```bash + label precision recall f1-score support + B-Age 0.93 0.90 0.91 277 + B-Alcohol 0.90 0.88 0.89 410 +B-Childhood_Development 1.00 1.00 1.00 1 + B-Diet 1.00 1.00 1.00 6 + B-Disability 0.96 0.95 0.96 57 + B-Employment 0.91 0.79 0.85 1926 + B-Family_Member 0.93 0.97 0.95 2412 + B-Gender 0.97 0.99 0.98 6161 + B-Geographic_Entity 0.81 0.79 0.80 82 + B-Housing 0.82 0.73 0.77 183 + B-Marital_Status 0.93 0.91 0.92 184 + B-Mental_Health 0.85 0.72 0.78 487 + B-Other_Disease 0.77 0.82 0.79 381 + B-Race_Ethnicity 0.91 0.94 0.93 34 + B-Sexual_Orientation 0.75 0.90 0.82 10 + B-Smoking 0.96 0.96 0.96 209 + B-Substance_Frequency 0.92 0.88 0.90 88 + B-Substance_Quantity 0.83 0.79 0.81 72 + B-Substance_Use 0.80 0.82 0.81 213 + I-Age 0.91 0.95 0.93 589 + I-Alcohol 0.80 0.77 0.79 159 +I-Childhood_Development 1.00 1.00 1.00 3 + I-Diet 1.00 0.89 0.94 9 + I-Disability 1.00 0.53 0.70 15 + I-Employment 0.77 0.62 0.69 369 + I-Family_Member 0.79 0.84 0.81 138 + I-Gender 0.57 0.88 0.69 231 + I-Geographic_Entity 1.00 0.85 0.92 13 + I-Housing 0.86 0.83 0.84 362 + I-Marital_Status 1.00 0.18 0.31 11 + I-Mental_Health 0.81 0.47 0.59 241 + I-Other_Disease 0.75 0.74 0.75 256 + I-Race_Ethnicity 1.00 1.00 1.00 15 + I-Smoking 0.90 0.93 0.91 46 + I-Substance_Frequency 0.85 0.73 0.79 75 + I-Substance_Quantity 0.84 0.88 0.86 174 + I-Substance_Use 0.86 0.84 0.85 182 + O 0.99 0.99 0.99 148829 + accuracy - - 0.98 164910 + macro_avg 0.89 0.83 0.85 164910 + weighted_avg 0.98 0.98 0.98 164910 +``` From 6cc8988cd848f8a58a322d81115c9638eca6642e Mon Sep 17 00:00:00 2001 From: jsl-models <74001263+jsl-models@users.noreply.github.com> Date: Wed, 16 Nov 2022 19:02:35 +0700 Subject: [PATCH 37/57] Add model 2022-11-16-abbreviation_category_mapper_en (#13095) Co-authored-by: Ahmetemintek --- ...2-11-16-abbreviation_category_mapper_en.md | 161 ++++++++++++++++++ 1 file changed, 161 insertions(+) create mode 100644 docs/_posts/Ahmetemintek/2022-11-16-abbreviation_category_mapper_en.md diff 
--git a/docs/_posts/Ahmetemintek/2022-11-16-abbreviation_category_mapper_en.md b/docs/_posts/Ahmetemintek/2022-11-16-abbreviation_category_mapper_en.md new file mode 100644 index 00000000000000..d86a109aa6e89a --- /dev/null +++ b/docs/_posts/Ahmetemintek/2022-11-16-abbreviation_category_mapper_en.md @@ -0,0 +1,161 @@ +--- +layout: model +title: Mapping Abbreviations and Acronyms of Medical Regulatory Activities with Their Definitions and Categories +author: John Snow Labs +name: abbreviation_category_mapper +date: 2022-11-16 +tags: [abbreviation, definition, category, licensed, en, clinical, chunk_mapper] +task: Chunk Mapping +language: en +edition: Healthcare NLP 4.2.1 +spark_version: 3.0 +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This pretrained model maps abbreviations and acronyms of medical regulatory activities with their definitions and categories. Predicted categories: `general`, `problem`, `test`, `treatment`, `medical_condition`, `clinical_dept`, `drug`, `nursing`, `internal_organ_or_component`, `hospital_unit`, `drug_frequency`, `employment`, `procedure` + +## Predicted Entities + +`definition`, `category` + +{:.btn-box} + +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/26.Chunk_Mapping.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/abbreviation_category_mapper_en_4.2.1_3.0_1668594867892.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_detector = SentenceDetector()\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer()\ + .setInputCols("sentence")\ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token"])\ + .setOutputCol("embeddings") + +abbr_ner = MedicalNerModel.pretrained("ner_abbreviation_clinical", "en", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("abbr_ner") + +abbr_converter = NerConverter() \ + .setInputCols(["sentence", "token", "abbr_ner"]) \ + .setOutputCol("abbr_ner_chunk")\ + +chunkerMapper = ChunkMapperModel.pretrained("abbreviation_category_mapper", "en", "clinical/models")\ + .setInputCols(["abbr_ner_chunk"])\ + .setOutputCol("mappings")\ + .setRels(["definition", "category"])\ + +pipeline = Pipeline().setStages([ + document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + abbr_ner, + abbr_converter, + chunkerMapper]) + + +text = ["""Gravid with estimated fetal weight of 6-6/12 pounds. + LABORATORY DATA: Laboratory tests include a CBC which is normal. + VDRL: Nonreactive + HIV: Negative. One-Hour Glucose: 117. 
Group B strep has not been done as yet."""] + +data = spark.createDataFrame([text]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = new SentenceDetector() + .setInputCols(Array("document")) + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val abbr_ner = MedicalNerModel.pretrained("ner_abbreviation_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("abbr_ner") + +val abbr_converter = new NerConverter() + .setInputCols(Array("sentence", "token", "abbr_ner")) + .setOutputCol("abbr_ner_chunk") + +val chunkerMapper = ChunkMapperModel.pretrained("abbreviation_category_mapper", "en", "clinical/models") + .setInputCols("abbr_ner_chunk") + .setOutputCol("mappings") + .setRels(Array("definition", "category")) + + +val pipeline = new Pipeline().setStages(Array( + document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + abbr_ner, + abbr_converter, + chunkerMapper)) + + +val sample_text = """Gravid with estimated fetal weight of 6-6/12 pounds. + LABORATORY DATA: Laboratory tests include a CBC which is normal. + VDRL: Nonreactive + HIV: Negative. One-Hour Glucose: 117. Group B strep has not been done as yet.""" + + +val data = Seq(sample_text).toDS.toDF("text") + +val result= pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash +| | chunk | category | definition | +|---:|:--------|:------------------|:---------------------------------------| +| 0 | CBC | general | complete blood count | +| 1 | VDRL | clinical_dept | Venereal Disease Research Laboratories | +| 2 | HIV | medical_condition | Human immunodeficiency virus | +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|abbreviation_category_mapper| +|Compatibility:|Healthcare NLP 4.2.1+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[abbr_ner_chunk]| +|Output Labels:|[mappings]| +|Language:|en| +|Size:|128.2 KB| \ No newline at end of file From 8a3d1996166a2e57a0f3341f56e3e30b34a62242 Mon Sep 17 00:00:00 2001 From: jsl-models <74001263+jsl-models@users.noreply.github.com> Date: Tue, 22 Nov 2022 06:17:52 +0700 Subject: [PATCH 38/57] 2022-11-18-kegg_disease_mapper_en (#13113) * Add model 2022-11-18-kegg_disease_mapper_en * Add model 2022-11-21-kegg_drug_mapper_en Co-authored-by: Ahmetemintek --- .../2022-11-18-kegg_disease_mapper_en.md | 158 +++++++++++++++++ .../2022-11-21-kegg_drug_mapper_en.md | 159 ++++++++++++++++++ 2 files changed, 317 insertions(+) create mode 100644 docs/_posts/Ahmetemintek/2022-11-18-kegg_disease_mapper_en.md create mode 100644 docs/_posts/Ahmetemintek/2022-11-21-kegg_drug_mapper_en.md diff --git a/docs/_posts/Ahmetemintek/2022-11-18-kegg_disease_mapper_en.md b/docs/_posts/Ahmetemintek/2022-11-18-kegg_disease_mapper_en.md new file mode 100644 index 00000000000000..3c98fb503b787f --- /dev/null +++ b/docs/_posts/Ahmetemintek/2022-11-18-kegg_disease_mapper_en.md @@ -0,0 +1,158 @@ +--- +layout: model +title: Mapping Diseases from the KEGG Database to Their Corresponding Categories, Descriptions and Clinical Vocabularies +author: John Snow Labs +name: kegg_disease_mapper +date: 2022-11-18 +tags: [disease, category, description, icd10, icd11, mesh, brite, en, clinical, chunk_mapper, licensed] +task: Chunk Mapping +language: en +edition: 
Healthcare NLP 4.2.2 +spark_version: 3.0 +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This pretrained model maps diseases with their corresponding `category`, `description`, `icd10_code`, `icd11_code`, `mesh_code`, and hierarchical `brite_code`. This model was trained with the data from the KEGG database. + +## Predicted Entities + +`category`, `description`, `icd10_code`, `icd11_code`, `mesh_code`, `brite_code` + +{:.btn-box} + +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/26.Chunk_Mapping.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/kegg_disease_mapper_en_4.2.2_3.0_1668794743905.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_detector = SentenceDetector()\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer()\ + .setInputCols("sentence")\ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token"])\ + .setOutputCol("embeddings") + +ner = MedicalNerModel.pretrained("ner_diseases", "en", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +converter = NerConverter() \ + .setInputCols(["sentence", "token", "ner"]) \ + .setOutputCol("ner_chunk")\ + +chunkerMapper = ChunkMapperModel.pretrained("kegg_disease_mapper", "en", "clinical/models")\ + .setInputCols(["ner_chunk"])\ + .setOutputCol("mappings")\ + .setRels(["description", "category", "icd10_code", "icd11_code", "mesh_code", "brite_code"])\ + +pipeline = Pipeline().setStages([ + document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + converter, + chunkerMapper]) + + +text= "A 55-year-old female with a history of myopia, kniest dysplasia and prostate cancer. She was on glipizide , and dapagliflozin for congenital nephrogenic diabetes insipidus." 
+ +data = spark.createDataFrame([[text]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = new SentenceDetector() + .setInputCols(Array("document")) + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val ner = MedicalNerModel.pretrained("ner_diseases", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val converter = new NerConverter() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + +val chunkerMapper = ChunkMapperModel.pretrained("kegg_disease_mapper", "en", "clinical/models") + .setInputCols("ner_chunk") + .setOutputCol("mappings") + .setRels(Array("description", "category", "icd10_code", "icd11_code", "mesh_code", "brite_code")) + + +val pipeline = new Pipeline().setStages(Array( + document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + converter, + chunkerMapper)) + + +val text= "A 55-year-old female with a history of myopia, kniest dysplasia and prostate cancer. She was on glipizide , and dapagliflozin for congenital nephrogenic diabetes insipidus." + + +val data = Seq(text).toDS.toDF("text") + +val result= pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash ++-----------------------------------------+--------------------------------------------------+-----------------------+----------+----------+---------+-----------------------+ +| ner_chunk| description| category|icd10_code|icd11_code|mesh_code| brite_code| ++-----------------------------------------+--------------------------------------------------+-----------------------+----------+----------+---------+-----------------------+ +| myopia|Myopia is the most common ocular disorder world...| Nervous system disease| H52.1| 9D00.0| D009216| 08402,08403| +| kniest dysplasia|Kniest dysplasia is an autosomal dominant chond...|Congenital malformation| Q77.7| LD24.3| C537207| 08402,08403| +| prostate cancer|Prostate cancer constitutes a major health prob...| Cancer| C61| 2C82| NONE|08402,08403,08442,08441| +|congenital nephrogenic diabetes insipidus|Nephrogenic diabetes insipidus (NDI) is charact...| Urinary system disease| N25.1| GB90.4A| D018500| 08402,08403| ++-----------------------------------------+--------------------------------------------------+-----------------------+----------+----------+---------+-----------------------+ +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|kegg_disease_mapper| +|Compatibility:|Healthcare NLP 4.2.2+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[ner_chunk]| +|Output Labels:|[mappings]| +|Language:|en| +|Size:|595.6 KB| \ No newline at end of file diff --git a/docs/_posts/Ahmetemintek/2022-11-21-kegg_drug_mapper_en.md b/docs/_posts/Ahmetemintek/2022-11-21-kegg_drug_mapper_en.md new file mode 100644 index 00000000000000..519a2856f8b9e2 --- /dev/null +++ b/docs/_posts/Ahmetemintek/2022-11-21-kegg_drug_mapper_en.md @@ -0,0 +1,159 @@ +--- +layout: model +title: Mapping Drugs from the KEGG Database to Their Efficacies, Molecular Weights and Corresponding Codes from Other Databases +author: John Snow Labs +name: kegg_drug_mapper +date: 2022-11-21 +tags: [drug, 
efficacy, molecular_weight, cas, pubchem, chebi, ligandbox, nikkaji, pdbcct, chunk_mapper, clinical, en, licensed] +task: Chunk Mapping +language: en +edition: Healthcare NLP 4.2.2 +spark_version: 3.0 +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This pretrained model maps drugs with their corresponding `efficacy`, `molecular_weight` as well as `CAS`, `PubChem`, `ChEBI`, `LigandBox`, `NIKKAJI`, `PDB-CCD` codes. This model was trained with the data from the KEGG database. + +## Predicted Entities + +`efficacy`, `molecular_weight`, `CAS`, `PubChem`, `ChEBI`, `LigandBox`, `NIKKAJI`, `PDB-CCD` + +{:.btn-box} + +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/26.Chunk_Mapping.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/kegg_drug_mapper_en_4.2.2_3.0_1669069910375.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_detector = SentenceDetector()\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer()\ + .setInputCols("sentence")\ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token"])\ + .setOutputCol("embeddings") + +ner = MedicalNerModel.pretrained("ner_posology", "en", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +converter = NerConverter() \ + .setInputCols(["sentence", "token", "ner"]) \ + .setOutputCol("ner_chunk")\ + +chunkerMapper = ChunkMapperModel.pretrained("kegg_drug_mapper", "en", "clinical/models")\ + .setInputCols(["ner_chunk"])\ + .setOutputCol("mappings")\ + .setRels(["efficacy", "molecular_weight", "CAS", "PubChem", "ChEBI", "LigandBox", "NIKKAJI", "PDB-CCD"])\ + +pipeline = Pipeline().setStages([ + document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + converter, + chunkerMapper]) + + +text= "She is given OxyContin, folic acid, levothyroxine, Norvasc, aspirin, Neurontin" + +data = spark.createDataFrame([[text]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = new SentenceDetector() + .setInputCols(Array("document")) + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val ner = MedicalNerModel.pretrained("ner_posology", "en", "clinical/models") + 
.setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val converter = new NerConverter() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + +val chunkerMapper = ChunkMapperModel.pretrained("kegg_drug_mapper", "en", "clinical/models") + .setInputCols("ner_chunk") + .setOutputCol("mappings") + .setRels(Array("efficacy", "molecular_weight", "CAS", "PubChem", "ChEBI", "LigandBox", "NIKKAJI", "PDB-CCD")) + + +val pipeline = new Pipeline().setStages(Array( + document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + converter, + chunkerMapper)) + + +val text= "She is given OxyContin, folic acid, levothyroxine, Norvasc, aspirin, Neurontin" + +val data = Seq(text).toDS.toDF("text") + +val result= pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash ++-------------+--------------------------------------------------+----------------+----------+-----------+-------+---------+---------+-------+ +| ner_chunk| efficacy|molecular_weight| CAS| PubChem| ChEBI|LigandBox| NIKKAJI|PDB-CCD| ++-------------+--------------------------------------------------+----------------+----------+-----------+-------+---------+---------+-------+ +| OxyContin| Analgesic (narcotic), Opioid receptor agonist| 351.8246| 124-90-3| 7847912.0| 7859.0| D00847|J281.239H| NONE| +| folic acid|Anti-anemic, Hematopoietic, Supplement (folic a...| 441.3975| 59-30-3| 7847138.0|27470.0| D00070| J1.392G| FOL| +|levothyroxine| Replenisher (thyroid hormone)| 776.87| 51-48-9|9.6024815E7|18332.0| D08125| J4.118A| T44| +| Norvasc|Antihypertensive, Vasodilator, Calcium channel ...| 408.8759|88150-42-9|5.1091781E7| 2668.0| D07450| J33.383B| NONE| +| aspirin|Analgesic, Anti-inflammatory, Antipyretic, Anti...| 180.1574| 50-78-2| 7847177.0|15365.0| D00109| J2.300K| AIN| +| Neurontin| Anticonvulsant, Antiepileptic| 171.2368|60142-96-3| 7847398.0|42797.0| D00332| J39.388F| GBN| ++-------------+--------------------------------------------------+----------------+----------+-----------+-------+---------+---------+-------+ +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|kegg_drug_mapper| +|Compatibility:|Healthcare NLP 4.2.2+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[ner_chunk]| +|Output Labels:|[mappings]| +|Language:|en| +|Size:|1.0 MB| \ No newline at end of file From 45b58cdde66254936bba803464d8e7908c639932 Mon Sep 17 00:00:00 2001 From: jsl-models <74001263+jsl-models@users.noreply.github.com> Date: Tue, 22 Nov 2022 23:01:11 +0700 Subject: [PATCH 39/57] 2022-11-22-ner_deid_generic_bert_ro (#13121) * Add model 2022-11-22-ner_deid_generic_bert_ro * Add model 2022-11-22-ner_clinical_bert_ro * Add model 2022-11-22-ner_living_species_300_es Co-authored-by: Cabir40 --- 
.../2022-11-22-ner_clinical_bert_ro.md | 187 ++++++++++++++++++ .../2022-11-22-ner_deid_generic_bert_ro.md | 173 ++++++++++++++++ .../2022-11-22-ner_living_species_300_es.md | 166 ++++++++++++++++ 3 files changed, 526 insertions(+) create mode 100644 docs/_posts/Cabir40/2022-11-22-ner_clinical_bert_ro.md create mode 100644 docs/_posts/Cabir40/2022-11-22-ner_deid_generic_bert_ro.md create mode 100644 docs/_posts/Cabir40/2022-11-22-ner_living_species_300_es.md diff --git a/docs/_posts/Cabir40/2022-11-22-ner_clinical_bert_ro.md b/docs/_posts/Cabir40/2022-11-22-ner_clinical_bert_ro.md new file mode 100644 index 00000000000000..71def211c23334 --- /dev/null +++ b/docs/_posts/Cabir40/2022-11-22-ner_clinical_bert_ro.md @@ -0,0 +1,187 @@ +--- +layout: model +title: Detect Clinical Entities in Romanian (Bert, Base, Cased) +author: John Snow Labs +name: ner_clinical_bert +date: 2022-11-22 +tags: [licensed, clinical, ro, ner, bert] +task: Named Entity Recognition +language: ro +edition: Healthcare NLP 4.2.2 +spark_version: 3.0 +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Extract clinical entities from Romanian clinical texts. This model is trained using `bert_base_cased` embeddings. 
+ +## Predicted Entities + +`Measurements`, `Form`, `Symptom`, `Route`, `Procedure`, `Disease_Syndrome_Disorder`, `Score`, `Drug_Ingredient`, `Pulse`, `Frequency`, `Date`, `Body_Part`, `Drug_Brand_Name`, `Time`, `Direction`, `Medical_Device`, `Imaging_Technique`, `Test`, `Imaging_Findings`, `Imaging_Test`, `Test_Result`, `Weight`, `Clinical_Dept`, `Units` + +{:.btn-box} +[Live Demo](https://demo.johnsnowlabs.com/healthcare/DEID_PHI_TEXT_MULTI){:.button.button-orange} +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/1.Clinical_Named_Entity_Recognition_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_clinical_bert_ro_4.2.2_3.0_1669124033852.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler()\ +.setInputCol("text")\ +.setOutputCol("document") + +sentenceDetector = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx")\ +.setInputCols(["document"])\ +.setOutputCol("sentence") + +tokenizer = Tokenizer()\ +.setInputCols(["sentence"])\ +.setOutputCol("token") + +word_embeddings = BertEmbeddings.pretrained("bert_base_cased", "ro") \ +.setInputCols("sentence", "token") \ +.setOutputCol("embeddings") + +clinical_ner = MedicalNerModel.pretrained("ner_clinical_bert","ro","clinical/models")\ +.setInputCols(["sentence","token","embeddings"])\ +.setOutputCol("ner") + +ner_converter = NerConverter()\ +.setInputCols(["sentence","token","ner"])\ +.setOutputCol("ner_chunk") + +nlpPipeline = Pipeline(stages=[ +documentAssembler, +sentenceDetector, +tokenizer, +word_embeddings, +clinical_ner, +ner_converter]) + +data = spark.createDataFrame([[""" Solicitare: Angio CT cardio-toracic Dg. de trimitere Atrezie de valva pulmonara. Hipoplazie VS. Atrezie VAV stang. Anastomoza Glenn. Sp. Tromboza la nivelul anastomozei. Trimis de: Sectia Clinica Cardiologie (dr. Sue T.) Procedura Aparat GE Revolution HD. Branula albastra montata la nivelul membrului superior drept. Scout. Se administreaza 30 ml Iomeron 350 cu flux 2.2 ml/s, urmate de 20 ml ser fiziologic cu acelasi flux. 
Se efectueaza o examinare angio-CT cardiotoracica cu achizitii secventiale prospective la o frecventa cardiaca medie de 100/min."""]]).toDF("text") + +result = nlpPipeline.fit(data).transform(data) +``` +```scala +val document_assembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx") +.setInputCols(Array("document")) +.setOutputCol("sentence") + +val tokenizer = new Tokenizer() +.setInputCols(Array("sentence")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_base_cased", "ro") +.setInputCols(Array("sentence", "token")) +.setOutputCol("embeddings") + +val ner_model = MedicalNerModel.pretrained("ner_clinical_bert", "ro", "clinical/models") +.setInputCols(Array("sentence", "token", "embeddings")) +.setOutputCol("ner") + +val ner_converter = new NerConverter() +.setInputCols(Array("sentence", "token", "ner")) +.setOutputCol("ner_chunk") + +val pipeline = new Pipeline().setStages(Array(document_assembler, +sentence_detector, +tokenizer, +embeddings, +ner_model, +ner_converter)) + +val data = Seq("""Solicitare: Angio CT cardio-toracic Dg. de trimitere Atrezie de valva pulmonara. Hipoplazie VS. Atrezie VAV stang. Anastomoza Glenn. Sp. Tromboza la nivelul anastomozei. Trimis de: Sectia Clinica Cardiologie (dr. Sue T.) Procedura Aparat GE Revolution HD. Branula albastra montata la nivelul membrului superior drept. Scout. Se administreaza 30 ml Iomeron 350 cu flux 2.2 ml/s, urmate de 20 ml ser fiziologic cu acelasi flux. Se efectueaza o examinare angio-CT cardiotoracica cu achizitii secventiale prospective la o frecventa cardiaca medie de 100/min.""").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ro.embed.clinical.bert.base_cased").predict(""" Solicitare: Angio CT cardio-toracic Dg. de trimitere Atrezie de valva pulmonara. Hipoplazie VS. 
Atrezie VAV stang. Anastomoza Glenn. Sp. Tromboza la nivelul anastomozei. Trimis de: Sectia Clinica Cardiologie (dr. Sue T.) Procedura Aparat GE Revolution HD. Branula albastra montata la nivelul membrului superior drept. Scout. Se administreaza 30 ml Iomeron 350 cu flux 2.2 ml/s, urmate de 20 ml ser fiziologic cu acelasi flux. Se efectueaza o examinare angio-CT cardiotoracica cu achizitii secventiale prospective la o frecventa cardiaca medie de 100/min.""") +``` +
+ +## Results + +```bash ++--------------------------+-------------------------+ +|chunks |entities | ++--------------------------+-------------------------+ +|Angio CT cardio-toracic |Imaging_Test | +|Atrezie |Disease_Syndrome_Disorder| +|valva pulmonara |Body_Part | +|Hipoplazie |Disease_Syndrome_Disorder| +|VS |Body_Part | +|Atrezie |Disease_Syndrome_Disorder| +|VAV stang |Body_Part | +|Anastomoza Glenn |Disease_Syndrome_Disorder| +|Tromboza |Disease_Syndrome_Disorder| +|Sectia Clinica Cardiologie|Clinical_Dept | +|GE Revolution HD |Medical_Device | +|Branula albastra |Medical_Device | +|membrului superior drept |Body_Part | +|Scout |Body_Part | +|30 ml |Dosage | +|Iomeron 350 |Drug_Ingredient | +|2.2 ml/s |Dosage | +|20 ml |Dosage | +|ser fiziologic |Drug_Ingredient | +|angio-CT |Imaging_Test | ++--------------------------+-------------------------+ +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_clinical_bert| +|Compatibility:|Healthcare NLP 4.2.2+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|ro| +|Size:|16.3 MB| + +## Benchmarking + +```bash +label precision recall f1-score support +Body_Part 0.91 0.93 0.92 679 +Clinical_Dept 0.68 0.65 0.67 97 +Date 0.99 0.99 0.99 87 +Direction 0.66 0.76 0.70 50 +Disease_Syndrome_Disorder 0.73 0.76 0.74 121 +Dosage 0.78 1.00 0.87 38 +Drug_Ingredient 0.90 0.94 0.92 48 +Form 1.00 1.00 1.00 6 +Imaging_Findings 0.86 0.82 0.84 201 +Imaging_Technique 0.92 0.92 0.92 26 +Imaging_Test 0.93 0.98 0.95 205 +Measurements 0.71 0.69 0.70 214 +Medical_Device 0.85 0.81 0.83 42 +Pulse 0.82 1.00 0.90 9 +Route 1.00 0.91 0.95 33 +Score 1.00 0.98 0.99 41 +Time 1.00 1.00 1.00 28 +Units 0.60 0.93 0.73 88 +Weight 0.82 1.00 0.90 9 +micro-avg 0.84 0.87 0.86 2037 +macro-avg 0.70 0.74 0.72 2037 +weighted-avg 0.84 0.87 0.85 2037 +``` \ No newline at end of file diff --git 
a/docs/_posts/Cabir40/2022-11-22-ner_deid_generic_bert_ro.md b/docs/_posts/Cabir40/2022-11-22-ner_deid_generic_bert_ro.md new file mode 100644 index 00000000000000..993346e2da74b7 --- /dev/null +++ b/docs/_posts/Cabir40/2022-11-22-ner_deid_generic_bert_ro.md @@ -0,0 +1,173 @@ +--- +layout: model +title: Detect PHI for Generic Deidentification in Romanian (BERT) +author: John Snow Labs +name: ner_deid_generic_bert +date: 2022-11-22 +tags: [licensed, clinical, ro, deidentification, phi, generic, bert] +task: Named Entity Recognition +language: ro +edition: Healthcare NLP 4.2.2 +spark_version: 3.0 +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +The Named Entity Recognition annotator allows a generic model to be trained by using a Deep Learning architecture (Char CNNs - BiLSTM - CRF - word embeddings) inspired by a former state-of-the-art model for NER: Chiu & Nichols, Named Entity Recognition with Bidirectional LSTM-CNNs. + +Deidentification NER (Romanian) is a Named Entity Recognition model that annotates text to find protected health information that may need to be de-identified. It is trained with bert_base_cased embeddings and can detect 7 generic entities. + +This NER model is trained with a combination of custom datasets with several data augmentation mechanisms. 
+ +## Predicted Entities + +`AGE`, `CONTACT`, `DATE`, `ID`, `LOCATION`, `NAME`, `PROFESSION` + +{:.btn-box} +[Live Demo](https://demo.johnsnowlabs.com/healthcare/DEID_PHI_TEXT_MULTI/){:.button.button-orange} +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/healthcare/DEID_PHI_TEXT_MULTI.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_deid_generic_bert_ro_4.2.2_3.0_1669122326582.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentenceDetector = SentenceDetector()\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer()\ + .setInputCols(["sentence"])\ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_base_cased", "ro")\ + .setInputCols(["sentence","token"])\ + .setOutputCol("word_embeddings") + +clinical_ner = MedicalNerModel.pretrained("ner_deid_generic_bert", "ro", "clinical/models")\ + .setInputCols(["sentence","token","word_embeddings"])\ + .setOutputCol("ner") + +ner_converter = NerConverter()\ + .setInputCols(["sentence", "token", "ner"])\ + .setOutputCol("ner_chunk") + +nlpPipeline = Pipeline(stages=[documentAssembler, sentenceDetector, tokenizer, embeddings, clinical_ner, ner_converter]) + +text = """ +Spitalul Pentru Ochi de Deal, Drumul Oprea Nr. 972 Vaslui, 737405 România +Tel: +40(235)413773 +Data setului de analize: 25 May 2022 15:36:00 +Nume si Prenume : BUREAN MARIA, Varsta: 77 +Medic : Agota Evelyn Tımar +C.N.P : 2450502264401""" + +data = spark.createDataFrame([[text]]).toDF("text") + +results = nlpPipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentenceDetector = new SentenceDetector() + .setInputCols(Array("document")) + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols(Array("sentence")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_base_cased", "ro") + .setInputCols(Array("sentence","token")) + .setOutputCol("word_embeddings") + +val clinical_ner = MedicalNerModel.pretrained("ner_deid_generic_bert", "ro", "clinical/models") + .setInputCols(Array("sentence","token","word_embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverter() + .setInputCols(Array("sentence", 
"token", "ner")) + .setOutputCol("ner_chunk") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, sentenceDetector, tokenizer, embeddings, clinical_ner, ner_converter)) + +val text = """Spitalul Pentru Ochi de Deal, Drumul Oprea Nr. 972 Vaslui, 737405 România +Tel: +40(235)413773 +Data setului de analize: 25 May 2022 15:36:00 +Nume si Prenume : BUREAN MARIA, Varsta: 77 +Medic : Agota Evelyn Tımar +C.N.P : 2450502264401""" + +val data = Seq(text).toDS.toDF("text") + +val results = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash ++----------------------------+---------+ +|chunk |ner_label| ++----------------------------+---------+ +|Spitalul Pentru Ochi de Deal|LOCATION | +|Drumul Oprea Nr |LOCATION | +|972 |LOCATION | +|Vaslui |LOCATION | +|737405 |LOCATION | +|+40(235)413773 |CONTACT | +|25 May 2022 |DATE | +|BUREAN MARIA |NAME | +|77 |AGE | +|Agota Evelyn Tımar |NAME | +|2450502264401 |ID | ++----------------------------+---------+ +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_deid_generic_bert| +|Compatibility:|Healthcare NLP 4.2.2+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|ro| +|Size:|16.5 MB| + +## References + +- Custom John Snow Labs datasets +- Data augmentation techniques + +## Benchmarking + +```bash + label precision recall f1-score support + AGE 0.95 0.97 0.96 1186 + CONTACT 0.99 0.98 0.98 366 + DATE 0.96 0.92 0.94 4518 + ID 1.00 1.00 1.00 679 + LOCATION 0.91 0.90 0.90 1683 + NAME 0.93 0.96 0.94 2916 + PROFESSION 0.87 0.85 0.86 161 + micro-avg 0.94 0.94 0.94 11509 + macro-avg 0.94 0.94 0.94 11509 +weighted-avg 0.95 0.94 0.94 11509 +``` \ No newline at end of file diff --git a/docs/_posts/Cabir40/2022-11-22-ner_living_species_300_es.md b/docs/_posts/Cabir40/2022-11-22-ner_living_species_300_es.md new file mode 100644 index 00000000000000..3822b73d18f84e --- /dev/null +++ b/docs/_posts/Cabir40/2022-11-22-ner_living_species_300_es.md @@ -0,0 +1,166 @@ +--- +layout: model +title: Detect Living Species(embeddings_scielo_300d) +author: John Snow Labs +name: ner_living_species_300 +date: 2022-11-22 +tags: [licensed, clinical, es, ner] +task: Named Entity Recognition +language: es +edition: Healthcare NLP 4.2.2 +spark_version: 3.0 +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Extract living species from clinical texts in Spanish which is critical to scientific 
disciplines like medicine, biology, ecology/biodiversity, nutrition and agriculture. This model is trained using `embeddings_scielo_300d` embeddings. + +It is trained on the [LivingNER](https://temu.bsc.es/livingner/) corpus that is composed of clinical case reports extracted from miscellaneous medical specialties including COVID, oncology, infectious diseases, tropical medicine, urology, pediatrics, and others. + +## Predicted Entities + +`HUMAN`, `SPECIES` + +{:.btn-box} + +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/1.Clinical_Named_Entity_Recognition_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_living_species_300_es_4.2.2_3.0_1669127690723.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer()\ + .setInputCols(["sentence"])\ + .setOutputCol("token") + +embeddings = WordEmbeddingsModel.pretrained("embeddings_scielo_300d","es","clinical/models")\ + .setInputCols(["sentence", "token"])\ + .setOutputCol("embeddings") + +ner_model = MedicalNerModel.pretrained("ner_living_species_300", "es","clinical/models")\ + .setInputCols(["sentence", "token", "embeddings"])\ + .setOutputCol("ner") + +ner_converter = NerConverter()\ + .setInputCols(["sentence", "token", "ner"])\ + .setOutputCol("ner_chunk") + +pipeline = Pipeline(stages=[ + document_assembler, + sentence_detector, + tokenizer, + embeddings, + ner_model, + ner_converter + ]) + +data = spark.createDataFrame([["""Lactante varón de dos años. Antecedentes familiares sin interés. Antecedentes personales: Embarazo, parto y periodo neonatal normal. En seguimiento por alergia a legumbres, diagnosticado con diez meses por reacción urticarial generalizada con lentejas y garbanzos, con dieta de exclusión a legumbres desde entonces. En ésta visita la madre describe episodios de eritema en zona maxilar derecha con afectación ocular ipsilateral que se resuelve en horas tras la administración de corticoides. Le ha ocurrido en 5-6 ocasiones, en relación con la ingesta de alimentos previamente tolerados. Exploración complementaria: Cacahuete, ac(ige)19.2 Ku.arb/l. 
Resultados: Ante la sospecha clínica de Síndrome de Frey, se tranquiliza a los padres, explicándoles la naturaleza del cuadro y se cita para revisión anual."""]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx") + .setInputCols(Array("document")) + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols(Array("sentence")) + .setOutputCol("token") + +val embeddings = WordEmbeddingsModel.pretrained("embeddings_scielo_300d","es","clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val ner_model = MedicalNerModel.pretrained("ner_living_species_300", "es","clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverter() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + +val pipeline = new Pipeline().setStages(Array(document_assembler, + sentence_detector, + tokenizer, + embeddings, + ner_model, + ner_converter)) + +val data = Seq("""Lactante varón de dos años. Antecedentes familiares sin interés. Antecedentes personales: Embarazo, parto y periodo neonatal normal. En seguimiento por alergia a legumbres, diagnosticado con diez meses por reacción urticarial generalizada con lentejas y garbanzos, con dieta de exclusión a legumbres desde entonces. En ésta visita la madre describe episodios de eritema en zona maxilar derecha con afectación ocular ipsilateral que se resuelve en horas tras la administración de corticoides. Le ha ocurrido en 5-6 ocasiones, en relación con la ingesta de alimentos previamente tolerados. Exploración complementaria: Cacahuete, ac(ige)19.2 Ku.arb/l. 
Resultados: Ante la sospecha clínica de Síndrome de Frey, se tranquiliza a los padres, explicándoles la naturaleza del cuadro y se cita para revisión anual.""").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("es.med_ner.living_species.300").predict("""Lactante varón de dos años. Antecedentes familiares sin interés. Antecedentes personales: Embarazo, parto y periodo neonatal normal. En seguimiento por alergia a legumbres, diagnosticado con diez meses por reacción urticarial generalizada con lentejas y garbanzos, con dieta de exclusión a legumbres desde entonces. En ésta visita la madre describe episodios de eritema en zona maxilar derecha con afectación ocular ipsilateral que se resuelve en horas tras la administración de corticoides. Le ha ocurrido en 5-6 ocasiones, en relación con la ingesta de alimentos previamente tolerados. Exploración complementaria: Cacahuete, ac(ige)19.2 Ku.arb/l. Resultados: Ante la sospecha clínica de Síndrome de Frey, se tranquiliza a los padres, explicándoles la naturaleza del cuadro y se cita para revisión anual.""") +``` +
+ +## Results + +```bash ++--------------+-------+ +|ner_chunk |label | ++--------------+-------+ +|Lactante varón|HUMAN | +|familiares |HUMAN | +|personales |HUMAN | +|neonatal |HUMAN | +|legumbres |SPECIES| +|lentejas |SPECIES| +|garbanzos |SPECIES| +|legumbres |SPECIES| +|madre |HUMAN | +|Cacahuete |SPECIES| +|padres |HUMAN | ++--------------+-------+ +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_living_species_300| +|Compatibility:|Healthcare NLP 4.2.2+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|es| +|Size:|15.0 MB| + +## Benchmarking + +```bash +label precision recall f1-score support +B-HUMAN 0.98 0.97 0.98 3281 +B-SPECIES 0.94 0.98 0.96 3712 +I-HUMAN 0.87 0.81 0.84 297 +I-SPECIES 0.79 0.89 0.84 1732 +micro-avg 0.92 0.95 0.94 9022 +macro-avg 0.90 0.91 0.90 9022 +weighted-avg 0.93 0.95 0.94 9022 +``` \ No newline at end of file From 2e7641240f61e71ca79dc3ed222f5e65b206063a Mon Sep 17 00:00:00 2001 From: jsl-models <74001263+jsl-models@users.noreply.github.com> Date: Fri, 25 Nov 2022 03:46:47 +0700 Subject: [PATCH 40/57] 2022-11-24-ner_oncology_anatomy_general_en (#13139) * Add model 2022-11-24-ner_oncology_anatomy_general_en * Add model 2022-11-24-ner_oncology_anatomy_granular_en * Add model 2022-11-24-ner_oncology_biomarker_en * Add model 2022-11-24-ner_oncology_demographics_en * Add model 2022-11-24-ner_oncology_diagnosis_en * Add model 2022-11-24-ner_oncology_en * Add model 2022-11-24-ner_oncology_posology_en * Add model 2022-11-24-ner_oncology_response_to_treatment_en * Add model 2022-11-24-ner_oncology_test_en * Add model 2022-11-24-ner_oncology_therapy_en * Add model 2022-11-24-ner_oncology_tnm_en * Add model 2022-11-24-ner_oncology_unspecific_posology_en Co-authored-by: Ahmetemintek --- ...2-11-24-ner_oncology_anatomy_general_en.md | 148 ++++++++++++ ...-11-24-ner_oncology_anatomy_granular_en.md | 155 +++++++++++++ 
.../2022-11-24-ner_oncology_biomarker_en.md | 163 +++++++++++++ ...2022-11-24-ner_oncology_demographics_en.md | 150 ++++++++++++ .../2022-11-24-ner_oncology_diagnosis_en.md | 163 +++++++++++++ .../2022-11-24-ner_oncology_en.md | 219 ++++++++++++++++++ .../2022-11-24-ner_oncology_posology_en.md | 161 +++++++++++++ ...4-ner_oncology_response_to_treatment_en.md | 147 ++++++++++++ .../2022-11-24-ner_oncology_test_en.md | 152 ++++++++++++ .../2022-11-24-ner_oncology_therapy_en.md | 175 ++++++++++++++ .../2022-11-24-ner_oncology_tnm_en.md | 156 +++++++++++++ ...-24-ner_oncology_unspecific_posology_en.md | 152 ++++++++++++ 12 files changed, 1941 insertions(+) create mode 100644 docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_anatomy_general_en.md create mode 100644 docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_anatomy_granular_en.md create mode 100644 docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_biomarker_en.md create mode 100644 docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_demographics_en.md create mode 100644 docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_diagnosis_en.md create mode 100644 docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_en.md create mode 100644 docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_posology_en.md create mode 100644 docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_response_to_treatment_en.md create mode 100644 docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_test_en.md create mode 100644 docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_therapy_en.md create mode 100644 docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_tnm_en.md create mode 100644 docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_unspecific_posology_en.md diff --git a/docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_anatomy_general_en.md b/docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_anatomy_general_en.md new file mode 100644 index 00000000000000..571b330d1ba383 --- /dev/null +++ b/docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_anatomy_general_en.md @@ -0,0 
+1,148 @@ +--- +layout: model +title: Extract Anatomical Entities from Oncology Texts +author: John Snow Labs +name: ner_oncology_anatomy_general +date: 2022-11-24 +tags: [licensed, clinical, en, oncology, anatomy, ner] +task: Named Entity Recognition +language: en +edition: Healthcare NLP 4.2.2 +spark_version: 3.0 +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model extracts anatomical entities using an unspecific label. + +## Predicted Entities + +`Anatomical_Site`, `Direction` + +{:.btn-box} + +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/27.Oncology_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_oncology_anatomy_general_en_4.2.2_3.0_1669298930681.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer() \ + .setInputCols(["sentence"]) \ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token"]) \ + .setOutputCol("embeddings") + +ner = MedicalNerModel.pretrained("ner_oncology_anatomy_general", "en", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +ner_converter = NerConverter() \ + .setInputCols(["sentence", "token", "ner"]) \ + .setOutputCol("ner_chunk") +pipeline = Pipeline(stages=[document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter]) + +data = spark.createDataFrame([["The patient presented a mass in her left breast, and a possible metastasis in her lungs and in her liver."]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val ner = MedicalNerModel.pretrained("ner_oncology_anatomy_general", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverter() 
+ .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + + +val pipeline = new Pipeline().setStages(Array(document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter)) + +val data = Seq("The patient presented a mass in her left breast, and a possible metastasis in her lungs and in her liver.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash +| chunk | ner_label | +|:--------|:----------------| +| left | Direction | +| breast | Anatomical_Site | +| lungs | Anatomical_Site | +| liver | Anatomical_Site | +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_oncology_anatomy_general| +|Compatibility:|Healthcare NLP 4.2.2+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|en| +|Size:|34.3 MB| + +## References + +In-house annotated oncology case reports. + +## Benchmarking + +```bash + label tp fp fn total precision recall f1 +Anatomical_Site 2946 549 638 3584 0.84 0.82 0.83 + Direction 864 209 120 984 0.81 0.88 0.84 + macro_avg 3810 758 758 4568 0.82 0.85 0.84 + micro_avg 3810 758 758 4568 0.83 0.83 0.83 +``` \ No newline at end of file diff --git a/docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_anatomy_granular_en.md b/docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_anatomy_granular_en.md new file mode 100644 index 00000000000000..54a16844d633be --- /dev/null +++ b/docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_anatomy_granular_en.md @@ -0,0 +1,155 @@ +--- +layout: model +title: Extract Granular Anatomical Entities from Oncology Texts +author: John Snow Labs +name: ner_oncology_anatomy_granular +date: 2022-11-24 +tags: [licensed, clinical, en, oncology, ner, anatomy] +task: Named Entity Recognition +language: en +edition: Healthcare NLP 4.2.2 +spark_version: 3.0 +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model extracts mentions of anatomical entities using granular labels. 
+ +## Predicted Entities + +`Direction`, `Site_Lymph_Node`, `Site_Breast`, `Site_Other_Body_Part`, `Site_Bone`, `Site_Liver`, `Site_Lung`, `Site_Brain` + +{:.btn-box} + +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/27.Oncology_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_oncology_anatomy_granular_en_4.2.2_3.0_1669299394344.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer() \ + .setInputCols(["sentence"]) \ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token"]) \ + .setOutputCol("embeddings") + +ner = MedicalNerModel.pretrained("ner_oncology_anatomy_granular", "en", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +ner_converter = NerConverter() \ + .setInputCols(["sentence", "token", "ner"]) \ + .setOutputCol("ner_chunk") + +pipeline = Pipeline(stages=[document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter]) + +data = spark.createDataFrame([["The patient presented a mass in her left breast, and a possible metastasis in her lungs and in her liver."]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val ner = MedicalNerModel.pretrained("ner_oncology_anatomy_granular", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new
NerConverter() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + + +val pipeline = new Pipeline().setStages(Array(document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter)) + +val data = Seq("The patient presented a mass in her left breast, and a possible metastasis in her lungs and in her liver.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash +| chunk | ner_label | +|:--------|:------------| +| left | Direction | +| breast | Site_Breast | +| lungs | Site_Lung | +| liver | Site_Liver | +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_oncology_anatomy_granular| +|Compatibility:|Healthcare NLP 4.2.2+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|en| +|Size:|34.3 MB| + +## References + +In-house annotated oncology case reports. + +## Benchmarking + +```bash + label tp fp fn total precision recall f1 + Direction 822 221 162 984 0.79 0.84 0.81 + Site_Lymph_Node 481 38 70 551 0.93 0.87 0.90 + Site_Breast 88 14 59 147 0.86 0.60 0.71 +Site_Other_Body_Part 604 184 897 1501 0.77 0.40 0.53 + Site_Bone 252 74 61 313 0.77 0.81 0.79 + Site_Liver 178 92 56 234 0.66 0.76 0.71 + Site_Lung 398 98 161 559 0.80 0.71 0.75 + Site_Brain 197 44 82 279 0.82 0.71 0.76 + macro_avg 3020 765 1548 4568 0.80 0.71 0.74 + micro_avg 3020 765 1548 4568 0.80 0.66 0.71 +``` \ No newline at end of file diff --git a/docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_biomarker_en.md b/docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_biomarker_en.md new file mode 100644 index 00000000000000..624baf3434747f --- /dev/null +++ b/docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_biomarker_en.md @@ -0,0 +1,163 @@ +--- +layout: model +title: Extract Biomarkers and their Results +author: John Snow Labs +name: ner_oncology_biomarker +date: 2022-11-24 +tags: [licensed, clinical, en, ner, oncology, biomarker] +task: Named Entity Recognition +language: en +edition: Healthcare NLP 4.2.2 +spark_version: 3.0 +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model extracts mentions of biomarkers and biomarker results from oncology texts. 
+ +## Predicted Entities + +`Biomarker_Result`, `Biomarker` + +{:.btn-box} + +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/27.Oncology_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_oncology_biomarker_en_4.2.2_3.0_1669299787628.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer() \ + .setInputCols(["sentence"]) \ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token"]) \ + .setOutputCol("embeddings") + +ner = MedicalNerModel.pretrained("ner_oncology_biomarker", "en", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +ner_converter = NerConverter() \ + .setInputCols(["sentence", "token", "ner"]) \ + .setOutputCol("ner_chunk") + +pipeline = Pipeline(stages=[document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter]) + +data = spark.createDataFrame([["The results of immunohistochemical examination showed that she tested negative for CK7, synaptophysin (Syn), chromogranin A (CgA), Muc5AC, human epidermal growth factor receptor-2 (HER2), and Muc6; positive for CK20, Muc1, Muc2, E-cadherin, and p53; the Ki-67 index was about 87%."]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val ner =
MedicalNerModel.pretrained("ner_oncology_biomarker", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverter() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + + +val pipeline = new Pipeline().setStages(Array(document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter)) + +val data = Seq("The results of immunohistochemical examination showed that she tested negative for CK7, synaptophysin (Syn), chromogranin A (CgA), Muc5AC, human epidermal growth factor receptor-2 (HER2), and Muc6; positive for CK20, Muc1, Muc2, E-cadherin, and p53; the Ki-67 index was about 87%.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash +| chunk | ner_label | +|:-----------------------------------------|:-----------------| +| negative | Biomarker_Result | +| CK7 | Biomarker | +| synaptophysin | Biomarker | +| Syn | Biomarker | +| chromogranin A | Biomarker | +| CgA | Biomarker | +| Muc5AC | Biomarker | +| human epidermal growth factor receptor-2 | Biomarker | +| HER2 | Biomarker | +| Muc6 | Biomarker | +| positive | Biomarker_Result | +| CK20 | Biomarker | +| Muc1 | Biomarker | +| Muc2 | Biomarker | +| E-cadherin | Biomarker | +| p53 | Biomarker | +| Ki-67 index | Biomarker | +| 87% | Biomarker_Result | +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_oncology_biomarker| +|Compatibility:|Healthcare NLP 4.2.2+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|en| +|Size:|34.3 MB| + +## References + +In-house annotated oncology case reports. + +## Benchmarking + +```bash + label tp fp fn total precision recall f1 +Biomarker_Result 1030 148 415 1445 0.87 0.71 0.79 + Biomarker 1685 272 279 1964 0.86 0.86 0.86 + macro_avg 2715 420 694 3409 0.87 0.79 0.82 + micro_avg 2715 420 694 3409 0.87 0.80 0.83 +``` \ No newline at end of file diff --git a/docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_demographics_en.md b/docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_demographics_en.md new file mode 100644 index 00000000000000..9fa66e2ab8cf54 --- /dev/null +++ b/docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_demographics_en.md @@ -0,0 +1,150 @@ +--- +layout: model +title: Extract Demographic Entities from Oncology Texts +author: John Snow Labs +name: ner_oncology_demographics +date: 2022-11-24 +tags: [licensed, clinical, en, ner, oncology, demographics] +task: Named Entity Recognition +language: en +edition: Healthcare NLP 4.2.2 +spark_version: 3.0 +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + 
+This model extracts demographic information from oncology texts, including age, gender, and smoking status. + +## Predicted Entities + +`Smoking_Status`, `Age`, `Race_Ethnicity`, `Gender` + +{:.btn-box} + +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/27.Oncology_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_oncology_demographics_en_4.2.2_3.0_1669300163954.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer() \ + .setInputCols(["sentence"]) \ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token"]) \ + .setOutputCol("embeddings") + +ner = MedicalNerModel.pretrained("ner_oncology_demographics", "en", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +ner_converter = NerConverter() \ + .setInputCols(["sentence", "token", "ner"]) \ + .setOutputCol("ner_chunk") + +pipeline = Pipeline(stages=[document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter]) + +data = spark.createDataFrame([["The patient is a 40-year-old man with history of heavy smoking."]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val ner = MedicalNerModel.pretrained("ner_oncology_demographics", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverter() + .setInputCols(Array("sentence", "token",
"ner")) + .setOutputCol("ner_chunk") + + +val pipeline = new Pipeline().setStages(Array(document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter)) + +val data = Seq("The patient is a 40-year-old man with history of heavy smoking.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash +| chunk | ner_label | +|:------------|:---------------| +| 40-year-old | Age | +| man | Gender | +| smoking | Smoking_Status | +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_oncology_demographics| +|Compatibility:|Healthcare NLP 4.2.2+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|en| +|Size:|34.6 MB| + +## References + +In-house annotated oncology case reports. + +## Benchmarking + +```bash + label tp fp fn total precision recall f1 +Smoking_Status 60 19 8 68 0.76 0.88 0.82 + Age 934 33 15 949 0.97 0.98 0.97 +Race_Ethnicity 57 5 5 62 0.92 0.92 0.92 + Gender 1248 18 6 1254 0.99 1.00 0.99 + macro_avg 2299 75 34 2333 0.91 0.95 0.93 + micro_avg 2299 75 34 2333 0.97 0.99 0.98 +``` \ No newline at end of file diff --git a/docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_diagnosis_en.md b/docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_diagnosis_en.md new file mode 100644 index 00000000000000..0181422deb71bd --- /dev/null +++ b/docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_diagnosis_en.md @@ -0,0 +1,163 @@ +--- +layout: model +title: Detect Entities Related to Cancer Diagnosis +author: John Snow Labs +name: ner_oncology_diagnosis +date: 2022-11-24 +tags: [licensed, clinical, en, ner, oncology] +task: Named Entity Recognition +language: en +edition: Healthcare NLP 4.2.2 +spark_version: 3.0 +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model extracts entities related to cancer diagnosis, such as Metastasis, Histological_Type or Invasion. 
+ +## Predicted Entities + +`Histological_Type`, `Staging`, `Cancer_Score`, `Tumor_Finding`, `Invasion`, `Tumor_Size`, `Adenopathy`, `Performance_Status`, `Pathology_Result`, `Metastasis`, `Cancer_Dx`, `Grade` + +{:.btn-box} + +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/27.Oncology_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_oncology_diagnosis_en_4.2.2_3.0_1669300474926.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer() \ + .setInputCols(["sentence"]) \ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token"]) \ + .setOutputCol("embeddings") + +ner = MedicalNerModel.pretrained("ner_oncology_diagnosis", "en", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +ner_converter = NerConverter() \ + .setInputCols(["sentence", "token", "ner"]) \ + .setOutputCol("ner_chunk") + +pipeline = Pipeline(stages=[document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter]) + +data = spark.createDataFrame([["Two years ago, the patient presented with a tumor in her left breast and adenopathies. She was diagnosed with invasive ductal carcinoma. Last week she was also found to have a lung metastasis."]]).toDF("text") + +result = pipeline.fit(data).transform(data) +```
+```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val ner = MedicalNerModel.pretrained("ner_oncology_diagnosis", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverter() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + + +val pipeline = new Pipeline().setStages(Array(document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter)) + +val data = Seq("Two years ago, the patient presented with a tumor in her left breast and adenopathies. She was diagnosed with invasive ductal carcinoma. Last week she was also found to have a lung metastasis.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash +| chunk | ner_label | +|:-------------|:------------------| +| tumor | Tumor_Finding | +| adenopathies | Adenopathy | +| invasive | Histological_Type | +| ductal | Histological_Type | +| carcinoma | Cancer_Dx | +| metastasis | Metastasis | +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_oncology_diagnosis| +|Compatibility:|Healthcare NLP 4.2.2+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|en| +|Size:|34.3 MB| + +## References + +In-house annotated oncology case reports. + +## Benchmarking + +```bash + label tp fp fn total precision recall f1 + Histological_Type 354 63 99 453 0.85 0.78 0.81 + Staging 234 27 24 258 0.90 0.91 0.90 + Cancer_Score 36 15 26 62 0.71 0.58 0.64 + Tumor_Finding 1121 83 136 1257 0.93 0.89 0.91 + Invasion 154 27 27 181 0.85 0.85 0.85 + Tumor_Size 1058 126 71 1129 0.89 0.94 0.91 + Adenopathy 66 10 30 96 0.87 0.69 0.77 +Performance_Status 116 15 19 135 0.89 0.86 0.87 + Pathology_Result 852 686 290 1142 0.55 0.75 0.64 + Metastasis 356 15 14 370 0.96 0.96 0.96 + Cancer_Dx 1302 88 92 1394 0.94 0.93 0.94 + Grade 201 23 35 236 0.90 0.85 0.87 + macro_avg 5850 1178 863 6713 0.85 0.83 0.84 + micro_avg 5850 1178 863 6713 0.85 0.87 0.86 +``` \ No newline at end of file diff --git a/docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_en.md b/docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_en.md new file mode 100644 index 00000000000000..2577ea98f0f58f --- /dev/null +++ b/docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_en.md @@ -0,0 +1,219 @@ +--- +layout: model +title: Detect Oncology-Specific Entities +author: John Snow Labs +name: ner_oncology +date: 2022-11-24 +tags: [licensed, clinical, en, oncology, biomarker, treatment] +task: Named Entity Recognition +language: en +edition: Healthcare NLP 4.2.2 +spark_version: 3.0 +supported: true +article_header: + type: cover +use_language_switcher: 
"Python-Scala-Java" +--- + +## Description + +This model extracts more than 40 oncology-related entities, including therapies, tests and staging. + +## Predicted Entities + +`Histological_Type`, `Direction`, `Staging`, `Cancer_Score`, `Imaging_Test`, `Cycle_Number`, `Tumor_Finding`, `Site_Lymph_Node`, `Invasion`, `Response_To_Treatment`, `Smoking_Status`, `Tumor_Size`, `Cycle_Count`, `Adenopathy`, `Age`, `Biomarker_Result`, `Unspecific_Therapy`, `Site_Breast`, `Chemotherapy`, `Targeted_Therapy`, `Radiotherapy`, `Performance_Status`, `Pathology_Test`, `Site_Other_Body_Part`, `Cancer_Surgery`, `Line_Of_Therapy`, `Pathology_Result`, `Hormonal_Therapy`, `Site_Bone`, `Biomarker`, `Immunotherapy`, `Cycle_Day`, `Frequency`, `Route`, `Duration`, `Death_Entity`, `Metastasis`, `Site_Liver`, `Cancer_Dx`, `Grade`, `Date`, `Site_Lung`, `Site_Brain`, `Relative_Date`, `Race_Ethnicity`, `Gender`, `Oncogene`, `Dosage`, `Radiation_Dose` + +{:.btn-box} + +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/27.Oncology_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_oncology_en_4.2.2_3.0_1669306355829.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer() \ + .setInputCols(["sentence"]) \ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token"]) \ + .setOutputCol("embeddings") + +ner = MedicalNerModel.pretrained("ner_oncology", "en", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +ner_converter = NerConverter() \ + .setInputCols(["sentence", "token", "ner"]) \ + .setOutputCol("ner_chunk") + +pipeline = Pipeline(stages=[document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter]) + +data = spark.createDataFrame([["The had previously undergone a left mastectomy and an axillary lymph node dissection for a left breast cancer twenty years ago. The tumor was positive for ER and PR. Postoperatively, radiotherapy was administered to the residual breast. The cancer recurred as a right lung metastasis 13 years later. The patient underwent a regimen consisting of adriamycin (60 mg/m2) and cyclophosphamide (600 mg/m2) over six courses, as first line therapy."]]).toDF("text") + +result = pipeline.fit(data).transform(data) +```
+```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val ner = MedicalNerModel.pretrained("ner_oncology", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverter() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + + +val pipeline = new Pipeline().setStages(Array(document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter)) + +val data = Seq("The had previously undergone a left mastectomy and an axillary lymph node dissection for a left breast cancer twenty years ago. The tumor was positive for ER and PR. Postoperatively, radiotherapy was administered to the residual breast. The cancer recurred as a right lung metastasis 13 years later. The patient underwent a regimen consisting of adriamycin (60 mg/m2) and cyclophosphamide (600 mg/m2) over six courses, as first line therapy.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash +| chunk | ner_label | +|:-------------------------------|:----------------------| +| left | Direction | +| mastectomy | Cancer_Surgery | +| axillary lymph node dissection | Cancer_Surgery | +| left | Direction | +| breast cancer | Cancer_Dx | +| twenty years ago | Relative_Date | +| tumor | Tumor_Finding | +| positive | Biomarker_Result | +| ER | Biomarker | +| PR | Biomarker | +| radiotherapy | Radiotherapy | +| breast | Site_Breast | +| cancer | Cancer_Dx | +| recurred | Response_To_Treatment | +| right | Direction | +| lung | Site_Lung | +| metastasis | Metastasis | +| 13 years later | Relative_Date | +| adriamycin | Chemotherapy | +| 60 mg/m2 | Dosage | +| cyclophosphamide | Chemotherapy | +| 600 mg/m2 | Dosage | +| six courses | Cycle_Count | +| first line | Line_Of_Therapy | +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_oncology| +|Compatibility:|Healthcare NLP 4.2.2+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|en| +|Size:|34.6 MB| + +## References + +In-house annotated oncology case reports. 
+ +## Benchmarking + +```bash + label tp fp fn total precision recall f1 + Histological_Type 339 75 114 453 0.82 0.75 0.78 + Direction 832 163 152 984 0.84 0.85 0.84 + Staging 229 31 29 258 0.88 0.89 0.88 + Cancer_Score 37 8 25 62 0.82 0.60 0.69 + Imaging_Test 2027 214 177 2204 0.90 0.92 0.91 + Cycle_Number 73 29 24 97 0.72 0.75 0.73 + Tumor_Finding 1114 64 143 1257 0.95 0.89 0.91 + Site_Lymph_Node 491 53 60 551 0.90 0.89 0.90 + Invasion 158 36 23 181 0.81 0.87 0.84 +Response_To_Treatment 431 149 165 596 0.74 0.72 0.73 + Smoking_Status 66 18 2 68 0.79 0.97 0.87 + Tumor_Size 1050 112 79 1129 0.90 0.93 0.92 + Cycle_Count 177 62 53 230 0.74 0.77 0.75 + Adenopathy 67 12 29 96 0.85 0.70 0.77 + Age 930 33 19 949 0.97 0.98 0.97 + Biomarker_Result 1160 169 285 1445 0.87 0.80 0.84 + Unspecific_Therapy 198 86 80 278 0.70 0.71 0.70 + Site_Breast 125 15 22 147 0.89 0.85 0.87 + Chemotherapy 814 55 65 879 0.94 0.93 0.93 + Targeted_Therapy 195 27 33 228 0.88 0.86 0.87 + Radiotherapy 276 29 34 310 0.90 0.89 0.90 + Performance_Status 121 17 14 135 0.88 0.90 0.89 + Pathology_Test 888 296 162 1050 0.75 0.85 0.79 + Site_Other_Body_Part 909 275 592 1501 0.77 0.61 0.68 + Cancer_Surgery 693 119 126 819 0.85 0.85 0.85 + Line_Of_Therapy 101 11 5 106 0.90 0.95 0.93 + Pathology_Result 655 279 487 1142 0.70 0.57 0.63 + Hormonal_Therapy 169 4 16 185 0.98 0.91 0.94 + Site_Bone 264 81 49 313 0.77 0.84 0.80 + Biomarker 1259 238 256 1515 0.84 0.83 0.84 + Immunotherapy 103 47 25 128 0.69 0.80 0.74 + Cycle_Day 200 36 48 248 0.85 0.81 0.83 + Frequency 354 27 73 427 0.93 0.83 0.88 + Route 91 15 22 113 0.86 0.81 0.83 + Duration 625 161 136 761 0.80 0.82 0.81 + Death_Entity 34 2 4 38 0.94 0.89 0.92 + Metastasis 353 18 17 370 0.95 0.95 0.95 + Site_Liver 189 64 45 234 0.75 0.81 0.78 + Cancer_Dx 1301 103 93 1394 0.93 0.93 0.93 + Grade 190 27 46 236 0.88 0.81 0.84 + Date 807 21 24 831 0.97 0.97 0.97 + Site_Lung 469 110 90 559 0.81 0.84 0.82 + Site_Brain 221 64 58 279 0.78 0.79 0.78 + Relative_Date 1211 401 
111 1322 0.75 0.92 0.83 + Race_Ethnicity 57 8 5 62 0.88 0.92 0.90 + Gender 1247 17 7 1254 0.99 0.99 0.99 + Oncogene 345 83 104 449 0.81 0.77 0.79 + Dosage 900 30 160 1060 0.97 0.85 0.90 + Radiation_Dose 108 5 18 126 0.96 0.86 0.90 + macro_avg 24653 3999 4406 29059 0.85 0.84 0.84 + micro_avg 24653 3999 4406 29059 0.86 0.85 0.85 +``` \ No newline at end of file diff --git a/docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_posology_en.md b/docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_posology_en.md new file mode 100644 index 00000000000000..5cf4bb2eb160c1 --- /dev/null +++ b/docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_posology_en.md @@ -0,0 +1,161 @@ +--- +layout: model +title: Extract Cancer Therapies and Granular Posology Information +author: John Snow Labs +name: ner_oncology_posology +date: 2022-11-24 +tags: [licensed, clinical, en, oncology, ner, treatment, posology] +task: Named Entity Recognition +language: en +edition: Healthcare NLP 4.2.2 +spark_version: 3.0 +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model extracts cancer therapies (Cancer_Surgery, Radiotherapy and Cancer_Therapy) and posology information at a granular level. + +## Predicted Entities + +`Cycle_Number`, `Cycle_Count`, `Radiotherapy`, `Cancer_Surgery`, `Cycle_Day`, `Frequency`, `Route`, `Cancer_Therapy`, `Duration`, `Dosage`, `Radiation_Dose` + +{:.btn-box} + +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/27.Oncology_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_oncology_posology_en_4.2.2_3.0_1669306988706.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer() \ + .setInputCols(["sentence"]) \ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token"]) \ + .setOutputCol("embeddings") + +ner = MedicalNerModel.pretrained("ner_oncology_posology", "en", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +ner_converter = NerConverter() \ + .setInputCols(["sentence", "token", "ner"]) \ + .setOutputCol("ner_chunk") + +pipeline = Pipeline(stages=[document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter]) + +data = spark.createDataFrame([["The patient underwent a regimen consisting of adriamycin (60 mg/m2) and cyclophosphamide (600 mg/m2) over six courses. 
She is currently receiving her second cycle of chemotherapy and is in good overall condition."]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val ner = MedicalNerModel.pretrained("ner_oncology_posology", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverter() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + + +val pipeline = new Pipeline().setStages(Array(document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter)) + +val data = Seq("The patient underwent a regimen consisting of adriamycin (60 mg/m2) and cyclophosphamide (600 mg/m2) over six courses. She is currently receiving her second cycle of chemotherapy and is in good overall condition.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash +| chunk | ner_label | +|:-----------------|:---------------| +| adriamycin | Cancer_Therapy | +| 60 mg/m2 | Dosage | +| cyclophosphamide | Cancer_Therapy | +| 600 mg/m2 | Dosage | +| six courses | Cycle_Count | +| second cycle | Cycle_Number | +| chemotherapy | Cancer_Therapy | +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_oncology_posology| +|Compatibility:|Healthcare NLP 4.2.2+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|en| +|Size:|34.3 MB| + +## References + +In-house annotated oncology case reports. + +## Benchmarking + +```bash + label tp fp fn total precision recall f1 + Cycle_Number 52 4 45 97 0.93 0.54 0.68 + Cycle_Count 200 63 30 230 0.76 0.87 0.81 + Radiotherapy 255 16 55 310 0.94 0.82 0.88 +Cancer_Surgery 592 66 227 819 0.90 0.72 0.80 + Cycle_Day 175 22 73 248 0.89 0.71 0.79 + Frequency 337 44 90 427 0.88 0.79 0.83 + Route 53 1 60 113 0.98 0.47 0.63 +Cancer_Therapy 1448 81 250 1698 0.95 0.85 0.90 + Duration 525 154 236 761 0.77 0.69 0.73 + Dosage 858 79 202 1060 0.92 0.81 0.86 +Radiation_Dose 86 4 40 126 0.96 0.68 0.80 + macro_avg 4581 534 1308 5889 0.90 0.72 0.79 + micro_avg 4581 534 1308 5889 0.90 0.78 0.83 +``` \ No newline at end of file diff --git a/docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_response_to_treatment_en.md b/docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_response_to_treatment_en.md new file mode 100644 index 00000000000000..f119de14a2fd47 --- /dev/null +++ b/docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_response_to_treatment_en.md @@ -0,0 +1,147 @@ +--- +layout: model +title: Extract Mentions of Response to Cancer Treatment +author: John Snow Labs +name: ner_oncology_response_to_treatment +date: 2022-11-24 +tags: [licensed, clinical, en, oncology, ner, treatment] +task: Named Entity Recognition +language: en +edition: Healthcare NLP 4.2.2 +spark_version: 3.0 +supported: 
true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model extracts entities related to the patient's response to the oncology treatment, including clinical response and changes in tumor size. + +## Predicted Entities + +`Response_To_Treatment`, `Size_Trend`, `Line_Of_Therapy` + +{:.btn-box} + +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/27.Oncology_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_oncology_response_to_treatment_en_4.2.2_3.0_1669307329775.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer() \ + .setInputCols(["sentence"]) \ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token"]) \ + .setOutputCol("embeddings") + +ner = MedicalNerModel.pretrained("ner_oncology_response_to_treatment", "en", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +ner_converter = NerConverter() \ + .setInputCols(["sentence", "token", "ner"]) \ + .setOutputCol("ner_chunk") +pipeline = Pipeline(stages=[document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter]) + +data = spark.createDataFrame([["She completed her first-line therapy, but some months later there was recurrence of the breast cancer. 
"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val ner = MedicalNerModel.pretrained("ner_oncology_response_to_treatment", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverter() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + + +val pipeline = new Pipeline().setStages(Array(document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter)) + +val data = Seq("She completed her first-line therapy, but some months later there was recurrence of the breast cancer. ").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash +| chunk | ner_label | +|:-----------|:----------------------| +| first-line | Line_Of_Therapy | +| recurrence | Response_To_Treatment | +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_oncology_response_to_treatment| +|Compatibility:|Healthcare NLP 4.2.2+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|en| +|Size:|34.4 MB| + +## References + +In-house annotated oncology case reports. + +## Benchmarking + +```bash + label tp fp fn total precision recall f1 +Response_To_Treatment 326 101 157 483 0.76 0.67 0.72 + Size_Trend 43 28 70 113 0.61 0.38 0.47 + Line_Of_Therapy 99 11 7 106 0.90 0.93 0.92 + macro_avg 468 140 234 702 0.76 0.66 0.70 + micro_avg 468 140 234 702 0.76 0.67 0.71 +``` \ No newline at end of file diff --git a/docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_test_en.md b/docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_test_en.md new file mode 100644 index 00000000000000..65fb87ee0d655c --- /dev/null +++ b/docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_test_en.md @@ -0,0 +1,152 @@ +--- +layout: model +title: Extract Oncology Tests +author: John Snow Labs +name: ner_oncology_test +date: 2022-11-24 +tags: [licensed, clinical, oncology, en, ner, test] +task: Named Entity Recognition +language: en +edition: Healthcare NLP 4.2.2 +spark_version: 3.0 +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model extracts mentions of tests from oncology texts, including pathology tests and imaging tests. 
+ +## Predicted Entities + +`Imaging_Test`, `Biomarker_Result`, `Pathology_Test`, `Biomarker`, `Oncogene` + +{:.btn-box} + +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/27.Oncology_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_oncology_test_en_4.2.2_3.0_1669307746859.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer() \ + .setInputCols(["sentence"]) \ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token"]) \ + .setOutputCol("embeddings") + +ner = MedicalNerModel.pretrained("ner_oncology_test", "en", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +ner_converter = NerConverter() \ + .setInputCols(["sentence", "token", "ner"]) \ + .setOutputCol("ner_chunk") + +pipeline = Pipeline(stages=[document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter]) + +data = spark.createDataFrame([["A biopsy was conducted using an ultrasound guided thick-needle. 
His chest computed tomography (CT) scan was negative."]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val ner = MedicalNerModel.pretrained("ner_oncology_test", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverter() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + + +val pipeline = new Pipeline().setStages(Array(document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter)) + +val data = Seq("A biopsy was conducted using an ultrasound guided thick-needle. His chest computed tomography (CT) scan was negative.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash +| chunk | ner_label | +|:-------------------------------|:---------------| +| biopsy | Pathology_Test | +| ultrasound guided thick-needle | Pathology_Test | +| chest computed tomography | Imaging_Test | +| CT | Imaging_Test | +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_oncology_test| +|Compatibility:|Healthcare NLP 4.2.2+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|en| +|Size:|34.2 MB| + +## References + +In-house annotated oncology case reports. + +## Benchmarking + +```bash + label tp fp fn total precision recall f1 + Imaging_Test 2020 229 184 2204 0.90 0.92 0.91 +Biomarker_Result 1177 186 268 1445 0.86 0.81 0.84 + Pathology_Test 888 276 162 1050 0.76 0.85 0.80 + Biomarker 1287 254 228 1515 0.84 0.85 0.84 + Oncogene 365 89 84 449 0.80 0.81 0.81 + macro_avg 5737 1034 926 6663 0.83 0.85 0.84 + micro_avg 5737 1034 926 6663 0.85 0.86 0.85 +``` \ No newline at end of file diff --git a/docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_therapy_en.md b/docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_therapy_en.md new file mode 100644 index 00000000000000..f60dc17183f344 --- /dev/null +++ b/docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_therapy_en.md @@ -0,0 +1,175 @@ +--- +layout: model +title: Detect Entities Related to Cancer Therapies +author: John Snow Labs +name: ner_oncology_therapy +date: 2022-11-24 +tags: [clinical, en, licensed, oncology, treatment, ner] +task: Named Entity Recognition +language: en +edition: Healthcare NLP 4.2.2 +spark_version: 3.0 +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model extracts entities related to oncology therapies using granular labels, including mentions of treatments, posology information and line of therapy. 
+ +## Predicted Entities + +`Cycle_Number`, `Response_To_Treatment`, `Cycle_Count`, `Unspecific_Therapy`, `Chemotherapy`, `Targeted_Therapy`, `Radiotherapy`, `Cancer_Surgery`, `Line_Of_Therapy`, `Hormonal_Therapy`, `Immunotherapy`, `Cycle_Day`, `Frequency`, `Route`, `Duration`, `Dosage`, `Radiation_Dose` + +{:.btn-box} + +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/27.Oncology_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_oncology_therapy_en_4.2.2_3.0_1669308088671.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer() \ + .setInputCols(["sentence"]) \ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token"]) \ + .setOutputCol("embeddings") + +ner = MedicalNerModel.pretrained("ner_oncology_therapy", "en", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +ner_converter = NerConverter() \ + .setInputCols(["sentence", "token", "ner"]) \ + .setOutputCol("ner_chunk") + +pipeline = Pipeline(stages=[document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter]) + +data = spark.createDataFrame([["""The patient had previously undergone a left mastectomy and an axillary lymph node dissection for a left breast cancer twenty years ago. +The tumor was positive for ER and PR. Postoperatively, radiotherapy was administered to her breast. +The cancer recurred as a right lung metastasis 13 years later. 
The patient underwent a regimen consisting of adriamycin (60 mg/m2) and cyclophosphamide (600 mg/m2) over six courses, as first line therapy."""]]).toDF("text") + +result = pipeline.fit(data).transform(data) + +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val ner = MedicalNerModel.pretrained("ner_oncology_therapy", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverter() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + + +val pipeline = new Pipeline().setStages(Array(document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter)) + +val data = Seq("""The patient had previously undergone a left mastectomy and an axillary lymph node dissection for a left breast cancer twenty years ago. +The tumor was positive for ER and PR. Postoperatively, radiotherapy was administered to her breast. +The cancer recurred as a right lung metastasis 13 years later. The patient underwent a regimen consisting of adriamycin (60 mg/m2) and cyclophosphamide (600 mg/m2) over six courses, as first line therapy.""").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash +| chunk | ner_label | +|:-------------------------------|:----------------------| +| mastectomy | Cancer_Surgery | +| axillary lymph node dissection | Cancer_Surgery | +| radiotherapy | Radiotherapy | +| recurred | Response_To_Treatment | +| adriamycin | Chemotherapy | +| 60 mg/m2 | Dosage | +| cyclophosphamide | Chemotherapy | +| 600 mg/m2 | Dosage | +| six courses | Cycle_Count | +| first line | Line_Of_Therapy | +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_oncology_therapy| +|Compatibility:|Healthcare NLP 4.2.2+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|en| +|Size:|34.4 MB| + +## References + +In-house annotated oncology case reports. + +## Benchmarking + +```bash + label tp fp fn total precision recall f1 + Cycle_Number 78 41 19 97 0.66 0.80 0.72 +Response_To_Treatment 451 205 145 596 0.69 0.76 0.72 + Cycle_Count 210 75 20 230 0.74 0.91 0.82 + Unspecific_Therapy 189 76 89 278 0.71 0.68 0.70 + Chemotherapy 831 87 48 879 0.91 0.95 0.92 + Targeted_Therapy 194 28 34 228 0.87 0.85 0.86 + Radiotherapy 279 35 31 310 0.89 0.90 0.89 + Cancer_Surgery 720 192 99 819 0.79 0.88 0.83 + Line_Of_Therapy 95 6 11 106 0.94 0.90 0.92 + Hormonal_Therapy 170 6 15 185 0.97 0.92 0.94 + Immunotherapy 96 17 32 128 0.85 0.75 0.80 + Cycle_Day 205 38 43 248 0.84 0.83 0.84 + Frequency 363 33 64 427 0.92 0.85 0.88 + Route 93 6 20 113 0.94 0.82 0.88 + Duration 527 102 234 761 0.84 0.69 0.76 + Dosage 959 63 101 1060 0.94 0.90 0.92 + Radiation_Dose 106 12 20 126 0.90 0.84 0.87 + macro_avg 5566 1022 1025 6591 0.85 0.84 0.84 + micro_avg 5566 1022 1025 6591 0.85 0.84 0.84 +``` \ No newline at end of file diff --git a/docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_tnm_en.md b/docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_tnm_en.md new file mode 100644 index 00000000000000..3c68a8869d1751 --- /dev/null +++ 
b/docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_tnm_en.md @@ -0,0 +1,156 @@ +--- +layout: model +title: Extract Entities Related to TNM Staging +author: John Snow Labs +name: ner_oncology_tnm +date: 2022-11-24 +tags: [licensed, en, clinical, oncology, ner] +task: Named Entity Recognition +language: en +edition: Healthcare NLP 4.2.2 +spark_version: 3.0 +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model extracts staging information and mentions related to tumors, lymph nodes and metastases. + +## Predicted Entities + +`Lymph_Node`, `Staging`, `Lymph_Node_Modifier`, `Tumor_Description`, `Tumor`, `Metastasis`, `Cancer_Dx` + +{:.btn-box} + +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/27.Oncology_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_oncology_tnm_en_4.2.2_3.0_1669308699155.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer() \ + .setInputCols(["sentence"]) \ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token"]) \ + .setOutputCol("embeddings") + +ner = MedicalNerModel.pretrained("ner_oncology_tnm", "en", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +ner_converter = NerConverter() \ + .setInputCols(["sentence", "token", "ner"]) \ + .setOutputCol("ner_chunk") + +pipeline = Pipeline(stages=[document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter]) + +data = spark.createDataFrame([["The final diagnosis was metastatic breast carcinoma, and it was classified as T2N1M1 stage IV. 
The histological grade of this 4 cm tumor was grade 2."]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val ner = MedicalNerModel.pretrained("ner_oncology_tnm", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverter() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + + +val pipeline = new Pipeline().setStages(Array(document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter)) + +val data = Seq("The final diagnosis was metastatic breast carcinoma, and it was classified as T2N1M1 stage IV. The histological grade of this 4 cm tumor was grade 2.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash +| chunk | ner_label | +|:-----------------|:------------------| +| metastatic | Metastasis | +| breast carcinoma | Cancer_Dx | +| T2N1M1 stage IV | Staging | +| 4 cm | Tumor_Description | +| tumor | Tumor | +| grade 2 | Tumor_Description | +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_oncology_tnm| +|Compatibility:|Healthcare NLP 4.2.2+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|en| +|Size:|34.2 MB| + +## References + +In-house annotated oncology case reports. + +## Benchmarking + +```bash + label tp fp fn total precision recall f1 + Lymph_Node 570 77 77 647 0.88 0.88 0.88 + Staging 232 22 26 258 0.91 0.90 0.91 +Lymph_Node_Modifier 30 5 5 35 0.86 0.86 0.86 + Tumor_Description 2651 581 490 3141 0.82 0.84 0.83 + Tumor 1116 72 141 1257 0.94 0.89 0.91 + Metastasis 358 15 12 370 0.96 0.97 0.96 + Cancer_Dx 1302 87 92 1394 0.94 0.93 0.94 + macro_avg 6259 859 843 7102 0.90 0.90 0.90 + micro_avg 6259 859 843 7102 0.88 0.88 0.88 +``` \ No newline at end of file diff --git a/docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_unspecific_posology_en.md b/docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_unspecific_posology_en.md new file mode 100644 index 00000000000000..70b089ad9fc003 --- /dev/null +++ b/docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_unspecific_posology_en.md @@ -0,0 +1,152 @@ +--- +layout: model +title: Extract Cancer Therapies and Posology Information +author: John Snow Labs +name: ner_oncology_unspecific_posology +date: 2022-11-24 +tags: [licensed, clinical, oncology, en, ner, treatment, posology] +task: Named Entity Recognition +language: en +edition: Healthcare NLP 4.2.2 +spark_version: 3.0 +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model extracts mentions of treatments and posology information using unspecific labels (low 
granularity). + +## Predicted Entities + +`Posology_Information`, `Cancer_Therapy` + +{:.btn-box} + +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/27.Oncology_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_oncology_unspecific_posology_en_4.2.2_3.0_1669309081671.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer() \ + .setInputCols(["sentence"]) \ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token"]) \ + .setOutputCol("embeddings") + +ner = MedicalNerModel.pretrained("ner_oncology_unspecific_posology", "en", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +ner_converter = NerConverter() \ + .setInputCols(["sentence", "token", "ner"]) \ + .setOutputCol("ner_chunk") + +pipeline = Pipeline(stages=[document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter]) + +data = spark.createDataFrame([["The patient underwent a regimen consisting of adriamycin (60 mg/m2) and cyclophosphamide (600 mg/m2) over six courses. 
She is currently receiving her second cycle of chemotherapy and is in good overall condition."]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val ner = MedicalNerModel.pretrained("ner_oncology_unspecific_posology", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverter() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + + +val pipeline = new Pipeline().setStages(Array(document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter)) + +val data = Seq("The patient underwent a regimen consisting of adriamycin (60 mg/m2) and cyclophosphamide (600 mg/m2) over six courses. She is currently receiving her second cycle of chemotherapy and is in good overall condition.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash +| chunk | ner_label | +|:-----------------|:---------------------| +| adriamycin | Cancer_Therapy | +| 60 mg/m2 | Posology_Information | +| cyclophosphamide | Cancer_Therapy | +| 600 mg/m2 | Posology_Information | +| over six courses | Posology_Information | +| second cycle | Posology_Information | +| chemotherapy | Cancer_Therapy | +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_oncology_unspecific_posology| +|Compatibility:|Healthcare NLP 4.2.2+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|en| +|Size:|34.3 MB| + +## References + +In-house annotated oncology case reports. + +## Benchmarking + +```bash + label tp fp fn total precision recall f1 +Posology_Information 2663 244 399 3062 0.92 0.87 0.89 + Cancer_Therapy 2580 317 247 2827 0.89 0.91 0.90 + macro_avg 5243 561 646 5889 0.90 0.89 0.90 + micro_avg 5243 561 646 5889 0.90 0.89 0.90 +``` \ No newline at end of file From 9884ae8453a36bcf3ede767c1bccff84d6fc4a7d Mon Sep 17 00:00:00 2001 From: jsl-models <74001263+jsl-models@users.noreply.github.com> Date: Tue, 6 Dec 2022 21:53:28 +0700 Subject: [PATCH 41/57] 2022-12-01-oncology_general_pipeline_en (#13178) * Add model 2022-12-01-oncology_general_pipeline_en * Add model 2022-12-01-oncology_diagnosis_pipeline_en * Add model 2022-12-01-oncology_biomarker_pipeline_en * Add model 2022-12-01-oncology_therapy_pipeline_en Co-authored-by: mauro-nievoff --- ...22-12-01-oncology_biomarker_pipeline_en.md | 214 ++++++++++++++++++ ...22-12-01-oncology_diagnosis_pipeline_en.md | 190 ++++++++++++++++ ...2022-12-01-oncology_general_pipeline_en.md | 172 ++++++++++++++ ...2022-12-01-oncology_therapy_pipeline_en.md | 141 ++++++++++++ 4 files changed, 717 insertions(+) create mode 100644 docs/_posts/mauro-nievoff/2022-12-01-oncology_biomarker_pipeline_en.md create mode 100644 
docs/_posts/mauro-nievoff/2022-12-01-oncology_diagnosis_pipeline_en.md create mode 100644 docs/_posts/mauro-nievoff/2022-12-01-oncology_general_pipeline_en.md create mode 100644 docs/_posts/mauro-nievoff/2022-12-01-oncology_therapy_pipeline_en.md diff --git a/docs/_posts/mauro-nievoff/2022-12-01-oncology_biomarker_pipeline_en.md b/docs/_posts/mauro-nievoff/2022-12-01-oncology_biomarker_pipeline_en.md new file mode 100644 index 00000000000000..aa69e3272d1019 --- /dev/null +++ b/docs/_posts/mauro-nievoff/2022-12-01-oncology_biomarker_pipeline_en.md @@ -0,0 +1,214 @@ +--- +layout: model +title: Oncology Pipeline for Biomarkers +author: John Snow Labs +name: oncology_biomarker_pipeline +date: 2022-12-01 +tags: [licensed, pipeline, oncology, biomarker, en] +task: Named Entity Recognition +language: en +edition: Healthcare NLP 4.2.2 +spark_version: 3.0 +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This pipeline includes Named-Entity Recognition, Assertion Status and Relation Extraction models to extract information from oncology texts. This pipeline focuses on entities related to biomarkers. + +{:.btn-box} + +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/27.Oncology_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/oncology_biomarker_pipeline_en_4.2.2_3.0_1669902355525.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +from sparknlp.pretrained import PretrainedPipeline + +pipeline = PretrainedPipeline("oncology_biomarker_pipeline", "en", "clinical/models") + +pipeline.annotate("Immunohistochemistry was negative for thyroid transcription factor-1 and napsin A. The test was positive for ER and PR, and negative for HER2.") +``` +```scala +import com.johnsnowlabs.nlp.pretrained.PretrainedPipeline + +val pipeline = new PretrainedPipeline("oncology_biomarker_pipeline", "en", "clinical/models") + +val result = pipeline.fullAnnotate("""Immunohistochemistry was negative for thyroid transcription factor-1 and napsin A. The test was positive for ER and PR, and negative for HER2.""")(0) +``` +
+ +## Results + +```bash +******************** ner_oncology_wip results ******************** + +| chunk | ner_label | +|:-------------------------------|:-----------------| +| negative | Biomarker_Result | +| thyroid transcription factor-1 | Biomarker | +| napsin | Biomarker | +| positive | Biomarker_Result | +| ER | Biomarker | +| PR | Biomarker | +| negative | Biomarker_Result | +| HER2 | Oncogene | + + +******************** ner_oncology_biomarker_wip results ******************** + +| chunk | ner_label | +|:-------------------------------|:-----------------| +| negative | Biomarker_Result | +| thyroid transcription factor-1 | Biomarker | +| napsin A | Biomarker | +| positive | Biomarker_Result | +| ER | Biomarker | +| PR | Biomarker | +| negative | Biomarker_Result | +| HER2 | Biomarker | + + +******************** ner_oncology_test_wip results ******************** + +| chunk | ner_label | +|:-------------------------------|:-----------------| +| Immunohistochemistry | Pathology_Test | +| negative | Biomarker_Result | +| thyroid transcription factor-1 | Biomarker | +| napsin A | Biomarker | +| positive | Biomarker_Result | +| ER | Biomarker | +| PR | Biomarker | +| negative | Biomarker_Result | +| HER2 | Oncogene | + + +******************** ner_biomarker results ******************** + +| chunk | ner_label | +|:-------------------------------|:----------------------| +| Immunohistochemistry | Test | +| negative | Biomarker_Measurement | +| thyroid transcription factor-1 | Biomarker | +| napsin A | Biomarker | +| positive | Biomarker_Measurement | +| ER | Biomarker | +| PR | Biomarker | +| negative | Biomarker_Measurement | +| HER2 | Biomarker | + + +******************** assertion_oncology_wip results ******************** + +| chunk | ner_label | assertion | +|:-------------------------------|:---------------|:------------| +| Immunohistochemistry | Pathology_Test | Past | +| thyroid transcription factor-1 | Biomarker | Present | +| napsin A | Biomarker | Present | 
+| ER | Biomarker | Present | +| PR | Biomarker | Present | +| HER2 | Oncogene | Present | + + +******************** assertion_oncology_test_binary_wip results ******************** + +| chunk | ner_label | assertion | +|:-------------------------------|:---------------|:----------------| +| Immunohistochemistry | Pathology_Test | Medical_History | +| thyroid transcription factor-1 | Biomarker | Medical_History | +| napsin A | Biomarker | Medical_History | +| ER | Biomarker | Medical_History | +| PR | Biomarker | Medical_History | +| HER2 | Oncogene | Medical_History | + + +******************** re_oncology_wip results ******************** + +| chunk1 | entity1 | chunk2 | entity2 | relation | +|:---------------------|:-----------------|:-------------------------------|:-----------------|:--------------| +| Immunohistochemistry | Pathology_Test | negative | Biomarker_Result | O | +| negative | Biomarker_Result | thyroid transcription factor-1 | Biomarker | is_related_to | +| negative | Biomarker_Result | napsin A | Biomarker | is_related_to | +| positive | Biomarker_Result | ER | Biomarker | is_related_to | +| positive | Biomarker_Result | PR | Biomarker | is_related_to | +| positive | Biomarker_Result | HER2 | Oncogene | O | +| ER | Biomarker | negative | Biomarker_Result | O | +| PR | Biomarker | negative | Biomarker_Result | O | +| negative | Biomarker_Result | HER2 | Oncogene | is_related_to | + + +******************** re_oncology_granular_wip results ******************** + +| chunk1 | entity1 | chunk2 | entity2 | relation | +|:---------------------|:-----------------|:-------------------------------|:-----------------|:--------------| +| Immunohistochemistry | Pathology_Test | negative | Biomarker_Result | O | +| negative | Biomarker_Result | thyroid transcription factor-1 | Biomarker | is_finding_of | +| negative | Biomarker_Result | napsin A | Biomarker | is_finding_of | +| positive | Biomarker_Result | ER | Biomarker | is_finding_of | +| positive | 
Biomarker_Result | PR | Biomarker | is_finding_of | +| positive | Biomarker_Result | HER2 | Oncogene | is_finding_of | +| ER | Biomarker | negative | Biomarker_Result | O | +| PR | Biomarker | negative | Biomarker_Result | O | +| negative | Biomarker_Result | HER2 | Oncogene | is_finding_of | + + +******************** re_oncology_biomarker_result_wip results ******************** + +| chunk1 | entity1 | chunk2 | entity2 | relation | +|:---------------------|:-----------------|:-------------------------------|:-----------------|:--------------| +| Immunohistochemistry | Pathology_Test | negative | Biomarker_Result | is_finding_of | +| negative | Biomarker_Result | thyroid transcription factor-1 | Biomarker | is_finding_of | +| negative | Biomarker_Result | napsin A | Biomarker | is_finding_of | +| positive | Biomarker_Result | ER | Biomarker | is_finding_of | +| positive | Biomarker_Result | PR | Biomarker | is_finding_of | +| positive | Biomarker_Result | HER2 | Oncogene | O | +| ER | Biomarker | negative | Biomarker_Result | O | +| PR | Biomarker | negative | Biomarker_Result | O | +| negative | Biomarker_Result | HER2 | Oncogene | is_finding_of | +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|oncology_biomarker_pipeline| +|Type:|pipeline| +|Compatibility:|Healthcare NLP 4.2.2+| +|License:|Licensed| +|Edition:|Official| +|Language:|en| +|Size:|1.7 GB| + +## Included Models + +- DocumentAssembler +- SentenceDetectorDLModel +- TokenizerModel +- WordEmbeddingsModel +- MedicalNerModel +- NerConverter +- MedicalNerModel +- NerConverter +- MedicalNerModel +- NerConverter +- MedicalNerModel +- NerConverter +- ChunkMergeModel +- ChunkMergeModel +- AssertionDLModel +- AssertionDLModel +- PerceptronModel +- DependencyParserModel +- RelationExtractionModel +- RelationExtractionModel +- RelationExtractionModel \ No newline at end of file diff --git a/docs/_posts/mauro-nievoff/2022-12-01-oncology_diagnosis_pipeline_en.md 
b/docs/_posts/mauro-nievoff/2022-12-01-oncology_diagnosis_pipeline_en.md new file mode 100644 index 00000000000000..81972391c2553d --- /dev/null +++ b/docs/_posts/mauro-nievoff/2022-12-01-oncology_diagnosis_pipeline_en.md @@ -0,0 +1,190 @@ +--- +layout: model +title: Oncology Pipeline for Diagnosis Entities +author: John Snow Labs +name: oncology_diagnosis_pipeline +date: 2022-12-01 +tags: [licensed, pipeline, oncology, en] +task: Named Entity Recognition +language: en +edition: Healthcare NLP 4.2.2 +spark_version: 3.0 +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This pipeline includes Named-Entity Recognition, Assertion Status, Relation Extraction and Entity Resolution models to extract information from oncology texts. This pipeline focuses on entities related to oncological diagnosis. + +{:.btn-box} + +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/27.Oncology_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/oncology_diagnosis_pipeline_en_4.2.2_3.0_1669901190921.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +from sparknlp.pretrained import PretrainedPipeline + +pipeline = PretrainedPipeline("oncology_diagnosis_pipeline", "en", "clinical/models") + +pipeline.fullAnnotate("""Two years ago, the patient presented with a 4-cm tumor in her left breast. She was diagnosed with ductal carcinoma. +According to her last CT, she has no lung metastases.""")[0] + +``` +```scala +import com.johnsnowlabs.nlp.pretrained.PretrainedPipeline + +val pipeline = new PretrainedPipeline("oncology_diagnosis_pipeline", "en", "clinical/models") + +val result = pipeline.fullAnnotate("""Two years ago, the patient presented with a 4-cm tumor in her left breast. She was diagnosed with ductal carcinoma. +According to her last CT, she has no lung metastases.""")(0) +``` +
+ +## Results + +```bash +******************** ner_oncology_wip results ******************** + +| chunk | ner_label | +|:-----------|:------------------| +| 4-cm | Tumor_Size | +| tumor | Tumor_Finding | +| left | Direction | +| breast | Site_Breast | +| ductal | Histological_Type | +| carcinoma | Cancer_Dx | +| lung | Site_Lung | +| metastases | Metastasis | + + +******************** ner_oncology_diagnosis_wip results ******************** + +| chunk | ner_label | +|:-----------|:------------------| +| 4-cm | Tumor_Size | +| tumor | Tumor_Finding | +| ductal | Histological_Type | +| carcinoma | Cancer_Dx | +| metastases | Metastasis | + + +******************** ner_oncology_tnm_wip results ******************** + +| chunk | ner_label | +|:-----------|:------------------| +| 4-cm | Tumor_Description | +| tumor | Tumor | +| ductal | Tumor_Description | +| carcinoma | Cancer_Dx | +| metastases | Metastasis | + + +******************** assertion_oncology_wip results ******************** + +| chunk | ner_label | assertion | +|:-----------|:------------------|:------------| +| tumor | Tumor_Finding | Present | +| ductal | Histological_Type | Present | +| carcinoma | Cancer_Dx | Present | +| metastases | Metastasis | Absent | + + +******************** assertion_oncology_problem_wip results ******************** + +| chunk | ner_label | assertion | +|:-----------|:------------------|:-----------------------| +| tumor | Tumor_Finding | Medical_History | +| ductal | Histological_Type | Medical_History | +| carcinoma | Cancer_Dx | Medical_History | +| metastases | Metastasis | Hypothetical_Or_Absent | + + +******************** re_oncology_wip results ******************** + +| chunk1 | entity1 | chunk2 | entity2 | relation | +|:---------|:--------------|:-----------|:--------------|:--------------| +| 4-cm | Tumor_Size | tumor | Tumor_Finding | is_related_to | +| 4-cm | Tumor_Size | carcinoma | Cancer_Dx | O | +| tumor | Tumor_Finding | breast | Site_Breast | is_related_to | +| 
breast | Site_Breast | carcinoma | Cancer_Dx | O | +| lung | Site_Lung | metastases | Metastasis | is_related_to | + + +******************** re_oncology_granular_wip results ******************** + +| chunk1 | entity1 | chunk2 | entity2 | relation | +|:---------|:--------------|:-----------|:--------------|:---------------| +| 4-cm | Tumor_Size | tumor | Tumor_Finding | is_size_of | +| 4-cm | Tumor_Size | carcinoma | Cancer_Dx | O | +| tumor | Tumor_Finding | breast | Site_Breast | is_location_of | +| breast | Site_Breast | carcinoma | Cancer_Dx | O | +| lung | Site_Lung | metastases | Metastasis | is_location_of | + + +******************** re_oncology_size_wip results ******************** + +| chunk1 | entity1 | chunk2 | entity2 | relation | +|:---------|:-----------|:----------|:--------------|:-----------| +| 4-cm | Tumor_Size | tumor | Tumor_Finding | is_size_of | +| 4-cm | Tumor_Size | carcinoma | Cancer_Dx | O | + + +******************** ICD-O resolver results ******************** + +| chunk | ner_label | code | normalized_term | +|:-----------|:------------------|:-------|:------------------| +| tumor | Tumor_Finding | 8000/1 | tumor | +| breast | Site_Breast | C50 | breast | +| ductal | Histological_Type | 8500/2 | dcis | +| carcinoma | Cancer_Dx | 8010/3 | carcinoma | +| lung | Site_Lung | C34.9 | lung | +| metastases | Metastasis | 8000/6 | tumor, metastatic | +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|oncology_diagnosis_pipeline| +|Type:|pipeline| +|Compatibility:|Healthcare NLP 4.2.2+| +|License:|Licensed| +|Edition:|Official| +|Language:|en| +|Size:|2.3 GB| + +## Included Models + +- DocumentAssembler +- SentenceDetectorDLModel +- TokenizerModel +- WordEmbeddingsModel +- MedicalNerModel +- NerConverter +- MedicalNerModel +- NerConverter +- MedicalNerModel +- NerConverter +- ChunkMergeModel +- ChunkMergeModel +- AssertionDLModel +- AssertionDLModel +- PerceptronModel +- DependencyParserModel +- 
RelationExtractionModel +- RelationExtractionModel +- RelationExtractionModel +- ChunkMergeModel +- Chunk2Doc +- BertSentenceEmbeddings +- SentenceEntityResolverModel \ No newline at end of file diff --git a/docs/_posts/mauro-nievoff/2022-12-01-oncology_general_pipeline_en.md b/docs/_posts/mauro-nievoff/2022-12-01-oncology_general_pipeline_en.md new file mode 100644 index 00000000000000..6d0265813e8195 --- /dev/null +++ b/docs/_posts/mauro-nievoff/2022-12-01-oncology_general_pipeline_en.md @@ -0,0 +1,172 @@ +--- +layout: model +title: General Oncology Pipeline +author: John Snow Labs +name: oncology_general_pipeline +date: 2022-12-01 +tags: [licensed, pipeline, oncology, en] +task: Named Entity Recognition +language: en +edition: Healthcare NLP 4.2.2 +spark_version: 3.0 +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This pipeline includes Named-Entity Recognition, Assertion Status and Relation Extraction models to extract information from oncology texts. This pipeline extracts diagnoses, treatments, tests, anatomical references and demographic entities. + +{:.btn-box} + +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/27.Oncology_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/oncology_general_pipeline_en_4.2.2_3.0_1669899456383.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +from sparknlp.pretrained import PretrainedPipeline + +pipeline = PretrainedPipeline("oncology_general_pipeline", "en", "clinical/models") + +pipeline.annotate("""The patient underwent a left mastectomy for a left breast cancer two months ago. +The tumor is positive for ER and PR.""") + +``` +```scala +import com.johnsnowlabs.nlp.pretrained.PretrainedPipeline + +val pipeline = new PretrainedPipeline("oncology_general_pipeline", "en", "clinical/models") + +val result = pipeline.fullAnnotate("""The patient underwent a left mastectomy for a left breast cancer two months ago. +The tumor is positive for ER and PR.""")(0) +``` +
+ +## Results + +```bash +******************** ner_oncology_wip results ******************** + +| chunk | ner_label | +|:---------------|:-----------------| +| left | Direction | +| mastectomy | Cancer_Surgery | +| left | Direction | +| breast cancer | Cancer_Dx | +| two months ago | Relative_Date | +| tumor | Tumor_Finding | +| positive | Biomarker_Result | +| ER | Biomarker | +| PR | Biomarker | + + +******************** ner_oncology_diagnosis_wip results ******************** + +| chunk | ner_label | +|:--------------|:--------------| +| breast cancer | Cancer_Dx | +| tumor | Tumor_Finding | + + +******************** ner_oncology_tnm_wip results ******************** + +| chunk | ner_label | +|:--------------|:------------| +| breast cancer | Cancer_Dx | +| tumor | Tumor | + + +******************** ner_oncology_therapy_wip results ******************** + +| chunk | ner_label | +|:-----------|:---------------| +| mastectomy | Cancer_Surgery | + + +******************** ner_oncology_test_wip results ******************** + +| chunk | ner_label | +|:---------|:-----------------| +| positive | Biomarker_Result | +| ER | Biomarker | +| PR | Biomarker | + + +******************** assertion_oncology_wip results ******************** + +| chunk | ner_label | assertion | +|:--------------|:---------------|:------------| +| mastectomy | Cancer_Surgery | Past | +| breast cancer | Cancer_Dx | Present | +| tumor | Tumor_Finding | Present | +| ER | Biomarker | Present | +| PR | Biomarker | Present | + + +******************** re_oncology_wip results ******************** + +| chunk1 | entity1 | chunk2 | entity2 | relation | +|:--------------|:-----------------|:---------------|:--------------|:--------------| +| mastectomy | Cancer_Surgery | two months ago | Relative_Date | is_related_to | +| breast cancer | Cancer_Dx | two months ago | Relative_Date | is_related_to | +| tumor | Tumor_Finding | ER | Biomarker | O | +| tumor | Tumor_Finding | PR | Biomarker | O | +| positive | 
Biomarker_Result | ER | Biomarker | is_related_to | +| positive | Biomarker_Result | PR | Biomarker | is_related_to | + + +******************** re_oncology_granular_wip results ******************** + +| chunk1 | entity1 | chunk2 | entity2 | relation | +|:--------------|:-----------------|:---------------|:--------------|:--------------| +| mastectomy | Cancer_Surgery | two months ago | Relative_Date | is_date_of | +| breast cancer | Cancer_Dx | two months ago | Relative_Date | is_date_of | +| tumor | Tumor_Finding | ER | Biomarker | O | +| tumor | Tumor_Finding | PR | Biomarker | O | +| positive | Biomarker_Result | ER | Biomarker | is_finding_of | +| positive | Biomarker_Result | PR | Biomarker | is_finding_of | +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|oncology_general_pipeline| +|Type:|pipeline| +|Compatibility:|Healthcare NLP 4.2.2+| +|License:|Licensed| +|Edition:|Official| +|Language:|en| +|Size:|1.7 GB| + +## Included Models + +- DocumentAssembler +- SentenceDetectorDLModel +- TokenizerModel +- WordEmbeddingsModel +- MedicalNerModel +- NerConverter +- MedicalNerModel +- NerConverter +- MedicalNerModel +- NerConverter +- MedicalNerModel +- NerConverter +- MedicalNerModel +- NerConverter +- ChunkMergeModel +- ChunkMergeModel +- AssertionDLModel +- PerceptronModel +- DependencyParserModel +- RelationExtractionModel +- RelationExtractionModel \ No newline at end of file diff --git a/docs/_posts/mauro-nievoff/2022-12-01-oncology_therapy_pipeline_en.md b/docs/_posts/mauro-nievoff/2022-12-01-oncology_therapy_pipeline_en.md new file mode 100644 index 00000000000000..d73d6e750ef661 --- /dev/null +++ b/docs/_posts/mauro-nievoff/2022-12-01-oncology_therapy_pipeline_en.md @@ -0,0 +1,141 @@ +--- +layout: model +title: Oncology Pipeline for Therapies +author: John Snow Labs +name: oncology_therapy_pipeline +date: 2022-12-01 +tags: [licensed, pipeline, oncology, en] +task: Named Entity Recognition +language: en +edition: 
Healthcare NLP 4.2.2 +spark_version: 3.0 +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This pipeline includes Named-Entity Recognition and Assertion Status models to extract information from oncology texts. This pipeline focuses on entities related to therapies. + +{:.btn-box} + +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/27.Oncology_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/oncology_therapy_pipeline_en_4.2.2_3.0_1669906146446.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +from sparknlp.pretrained import PretrainedPipeline + +pipeline = PretrainedPipeline("oncology_therapy_pipeline", "en", "clinical/models") + +pipeline.fullAnnotate("The patient underwent a mastectomy two years ago. She is currently receiving her second cycle of adriamycin and cyclophosphamide, and is in good overall condition.")[0] + +``` +```scala +import com.johnsnowlabs.nlp.pretrained.PretrainedPipeline + +val pipeline = new PretrainedPipeline("oncology_therapy_pipeline", "en", "clinical/models") + +val result = pipeline.fullAnnotate("""The patient underwent a mastectomy two years ago. She is currently receiving her second cycle of adriamycin and cyclophosphamide, and is in good overall condition.""")(0) +``` +
+ +## Results + +```bash +******************** ner_oncology_wip results ******************** + +| chunk | ner_label | +|:-----------------|:---------------| +| mastectomy | Cancer_Surgery | +| second cycle | Cycle_Number | +| adriamycin | Chemotherapy | +| cyclophosphamide | Chemotherapy | + + +******************** ner_oncology_therapy_wip results ******************** + +| chunk | ner_label | +|:-----------------|:---------------| +| mastectomy | Cancer_Surgery | +| second cycle | Cycle_Number | +| adriamycin | Chemotherapy | +| cyclophosphamide | Chemotherapy | + + +******************** ner_oncology_posology_wip results ******************** + +| chunk | ner_label | +|:-----------------|:---------------| +| mastectomy | Cancer_Surgery | +| second cycle | Cycle_Number | +| adriamycin | Cancer_Therapy | +| cyclophosphamide | Cancer_Therapy | + + +******************** ner_oncology_unspecific_posology_wip results ******************** + +| chunk | ner_label | +|:-----------------|:---------------------| +| mastectomy | Cancer_Therapy | +| second cycle | Posology_Information | +| adriamycin | Cancer_Therapy | +| cyclophosphamide | Cancer_Therapy | + + +******************** assertion_oncology_wip results ******************** + +| chunk | ner_label | assertion | +|:-----------------|:---------------|:------------| +| mastectomy | Cancer_Surgery | Past | +| adriamycin | Chemotherapy | Present | +| cyclophosphamide | Chemotherapy | Present | + + +******************** assertion_oncology_treatment_binary_wip results ******************** + +| chunk | ner_label | assertion | +|:-----------------|:---------------|:----------------| +| mastectomy | Cancer_Surgery | Present_Or_Past | +| adriamycin | Chemotherapy | Present_Or_Past | +| cyclophosphamide | Chemotherapy | Present_Or_Past | +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|oncology_therapy_pipeline| +|Type:|pipeline| +|Compatibility:|Healthcare NLP 4.2.2+| +|License:|Licensed| +|Edition:|Official| 
+|Language:|en| +|Size:|1.7 GB| + +## Included Models + +- DocumentAssembler +- SentenceDetectorDLModel +- TokenizerModel +- WordEmbeddingsModel +- MedicalNerModel +- NerConverter +- MedicalNerModel +- NerConverter +- MedicalNerModel +- NerConverter +- MedicalNerModel +- NerConverter +- ChunkMergeModel +- ChunkMergeModel +- AssertionDLModel +- AssertionDLModel \ No newline at end of file From 20e6f12f99e4be57439618c8ba06061f6784a8a1 Mon Sep 17 00:00:00 2001 From: jsl-models <74001263+jsl-models@users.noreply.github.com> Date: Thu, 15 Dec 2022 19:56:15 +0700 Subject: [PATCH 42/57] Add model 2022-12-15-drug_category_mapper_en (#13230) Co-authored-by: Ahmetemintek --- .../2022-12-15-drug_category_mapper_en.md | 159 ++++++++++++++++++ 1 file changed, 159 insertions(+) create mode 100644 docs/_posts/Ahmetemintek/2022-12-15-drug_category_mapper_en.md diff --git a/docs/_posts/Ahmetemintek/2022-12-15-drug_category_mapper_en.md b/docs/_posts/Ahmetemintek/2022-12-15-drug_category_mapper_en.md new file mode 100644 index 00000000000000..09741907942e0c --- /dev/null +++ b/docs/_posts/Ahmetemintek/2022-12-15-drug_category_mapper_en.md @@ -0,0 +1,159 @@ +--- +layout: model +title: Mapping Drugs to Their Categories as well as Other Brand and Names +author: John Snow Labs +name: drug_category_mapper +date: 2022-12-15 +tags: [category, chunk_mapper, drug, licensed, clinical, en] +task: Chunk Mapping +language: en +edition: Healthcare NLP 4.2.3 +spark_version: 3.0 +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This pretrained model maps drugs to their categories and other brands and names. It has two categories called main category and subcategory. 
+ +## Predicted Entities + +`main_category`, `sub_category`, `other_name` + +{:.btn-box} + +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/26.Chunk_Mapping.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/drug_category_mapper_en_4.2.3_3.0_1671100791411.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_detector = SentenceDetector()\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer()\ + .setInputCols("sentence")\ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token"])\ + .setOutputCol("embeddings") + +ner = MedicalNerModel.pretrained("ner_posology", "en", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +converter = NerConverter() \ + .setInputCols(["sentence", "token", "ner"]) \ + .setOutputCol("ner_chunk")\ + +chunkerMapper = ChunkMapperModel.pretrained("drug_category_mapper", "en", "clinical/models")\ + .setInputCols(["ner_chunk"])\ + .setOutputCol("mappings")\ + .setRels(["main_category", "sub_category", "other_name"])\ + +pipeline = Pipeline().setStages([ + document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + converter, + chunkerMapper]) + + +text= "She is given OxyContin, folic acid, levothyroxine, Norvasc, aspirin, Neurontin" + +data = spark.createDataFrame([[text]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = new SentenceDetector() + .setInputCols(Array("document")) + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val ner = MedicalNerModel.pretrained("ner_posology", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + 
.setOutputCol("ner") + +val converter = new NerConverter() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + +val chunkerMapper = ChunkMapperModel.pretrained("drug_category_mapper", "en", "clinical/models") + .setInputCols("ner_chunk") + .setOutputCol("mappings") + .setRels(Array("main_category", "sub_category", "other_name")) + + +val pipeline = new Pipeline().setStages(Array( + document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + converter, + chunkerMapper)) + + +val text= "She is given OxyContin, folic acid, levothyroxine, Norvasc, aspirin, Neurontin" + +val data = Seq(text).toDS.toDF("text") + +val result= pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash ++-------------+---------------------+-----------------------------------+-----------+ +| ner_chunk| main_category| sub_category|other_names| ++-------------+---------------------+-----------------------------------+-----------+ +| OxyContin| Pain Management| Opioid Analgesics| Oxaydo| +| folic acid| Nutritionals| Vitamins, Water-Soluble| Folvite| +|levothyroxine|Metabolic & Endocrine| Thyroid Products| Levo T| +| Norvasc| Cardiovascular| Antianginal Agents| Katerzia| +| aspirin| Cardiovascular|Antiplatelet Agents, Cardiovascular| ASA| +| Neurontin| Neurologics| GABA Analogs| Gralise| ++-------------+---------------------+-----------------------------------+-----------+ +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|drug_category_mapper| +|Compatibility:|Healthcare NLP 4.2.3+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[ner_chunk]| +|Output Labels:|[mappings]| +|Language:|en| +|Size:|526.0 KB| \ No newline at end of file From f163725a368e69cd97d09178300d48a67b6d6136 Mon Sep 17 00:00:00 2001 From: jsl-models <74001263+jsl-models@users.noreply.github.com> Date: Sat, 17 Dec 2022 16:06:29 +0700 Subject: [PATCH 43/57] 2022-12-17-ner_sdoh_mentions_en (#13245) * Add model 2022-12-17-ner_sdoh_mentions_en * Update 2022-12-17-ner_sdoh_mentions_en.md Co-authored-by: Damla-Gurbaz Co-authored-by: Cabir C <64752006+Cabir40@users.noreply.github.com> --- .../2022-12-17-ner_sdoh_mentions_en.md | 157 ++++++++++++++++++ 1 file changed, 157 insertions(+) create mode 100644 docs/_posts/Damla-Gurbaz/2022-12-17-ner_sdoh_mentions_en.md diff --git a/docs/_posts/Damla-Gurbaz/2022-12-17-ner_sdoh_mentions_en.md b/docs/_posts/Damla-Gurbaz/2022-12-17-ner_sdoh_mentions_en.md new file mode 100644 index 00000000000000..4e7c5d2262a91f --- /dev/null +++ b/docs/_posts/Damla-Gurbaz/2022-12-17-ner_sdoh_mentions_en.md @@ -0,0 +1,157 @@ +--- +layout: model +title: Detect Social Determinants of Health Mentions +author: John 
Snow Labs +name: ner_sdoh_mentions +date: 2022-12-17 +tags: [en, licensed, ner, sdoh, mentions, clinical] +task: Named Entity Recognition +language: en +edition: Healthcare NLP 4.2.3 +spark_version: 3.0 +supported: true +annotator: MedicalNerModel +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This Named Entity Recognition model is intended for detecting Social Determinants of Health mentions in clinical notes. It was trained using the MedicalNerApproach annotator, which allows training generic NER models based on Neural Networks. + +## Predicted Entities + +`sdoh_community`, `sdoh_economics`, `sdoh_education`, `sdoh_environment`, `behavior_tobacco`, `behavior_alcohol`, `behavior_drug` + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_sdoh_mentions_en_4.2.3_3.0_1671267131454.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document")\ + +sentenceDetector = SentenceDetectorDLModel.pretrained("sentence_detector_dl","xx")\ + .setInputCols("document")\ + .setOutputCol("sentence") + +tokenizer = Tokenizer()\ + .setInputCols(["sentence"])\ + .setOutputCol("token") + +embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols("sentence", "token")\ + .setOutputCol("embeddings") + +ner_model = MedicalNerModel.pretrained("ner_sdoh_mentions", "en", "clinical/models")\ + .setInputCols(["sentence", "token", "embeddings"])\ + .setOutputCol("ner") + +ner_converter = NerConverter()\ + .setInputCols(["sentence", "token", "ner"])\ + .setOutputCol("ner_chunk") + +nlpPipeline = Pipeline(stages=[ + document_assembler, + sentenceDetector, + tokenizer, + embeddings, + ner_model, + ner_converter]) + +df = spark.createDataFrame([["Mr. Known lastname 9880 is a pleasant, cooperative gentleman with a long standing history (20 years) diverticulitis. He is married and has 3 children. He works in a bank. He denies any alcohol or intravenous drug use. 
He has been smoking for many years."]]).toDF("text") + +result = nlpPipeline.fit(df).transform(df) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentenceDetector = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx") + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val ner_model = MedicalNerModel.pretrained("ner_sdoh_mentions", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverter() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + +val nlpPipeline = new Pipeline().setStages(Array(document_assembler, + sentenceDetector, + tokenizer, + embeddings, + ner_model, + ner_converter)) + +val data = Seq("Mr. Known lastname 9880 is a pleasant, cooperative gentleman with a long standing history (20 years) diverticulitis. He is married and has 3 children. He works in a bank. He denies any alcohol or intravenous drug use. He has been smoking for many years.").toDS.toDF("text") + +val result = nlpPipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash ++----------------+----------------+ +|chunk |ner_label | ++----------------+----------------+ +|married |sdoh_community | +|children |sdoh_community | +|works |sdoh_economics | +|alcohol |behavior_alcohol| +|intravenous drug|behavior_drug | +|smoking |behavior_tobacco| ++----------------+----------------+ +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_sdoh_mentions| +|Compatibility:|Healthcare NLP 4.2.3+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|en| +|Size:|15.1 MB| + +## Benchmarking + +```bash + label precision recall f1-score support +behavior_alcohol 0.95 0.94 0.94 798 + behavior_drug 0.93 0.92 0.92 366 +behavior_tobacco 0.95 0.95 0.95 936 + sdoh_community 0.97 0.97 0.97 969 + sdoh_economics 0.95 0.91 0.93 363 + sdoh_education 0.69 0.65 0.67 34 +sdoh_environment 0.93 0.90 0.92 651 + micro-avg 0.95 0.94 0.94 4117 + macro-avg 0.91 0.89 0.90 4117 + weighted-avg 0.95 0.94 0.94 4117 +``` From ff8b1a0c83f1f53bc2c20dc6b16fe878a560845c Mon Sep 17 00:00:00 2001 From: jsl-models <74001263+jsl-models@users.noreply.github.com> Date: Mon, 19 Dec 2022 00:56:51 +0700 Subject: [PATCH 44/57] 2022-12-18-meddroprof_scielowiki_es (#13252) * Add model 2022-12-18-meddroprof_scielowiki_es * Add model 2022-12-18-ner_sdoh_mentions_en * Add model 2022-12-18-bert_sequence_classifier_sdoh_community_absent_status_en * Add model 2022-12-18-bert_sequence_classifier_sdoh_community_present_status_en * Update 2022-12-18-bert_sequence_classifier_sdoh_community_present_status_en.md * Add model 2022-12-18-bert_sequence_classifier_sdoh_environment_status_en * Update 2022-12-18-bert_sequence_classifier_sdoh_community_absent_status_en.md * Update 2022-12-18-bert_sequence_classifier_sdoh_environment_status_en.md * Update 2022-12-18-meddroprof_scielowiki_es.md * Update 2022-12-18-ner_sdoh_mentions_en.md * Add model 
2022-12-18-drug_category_mapper_en * Update 2022-12-18-drug_category_mapper_en.md * Update 2022-12-18-bert_sequence_classifier_sdoh_community_absent_status_en.md * Update 2022-12-18-bert_sequence_classifier_sdoh_community_present_status_en.md * Update 2022-12-18-bert_sequence_classifier_sdoh_environment_status_en.md Co-authored-by: Damla-Gurbaz Co-authored-by: Damla Gurbaz <81505007+Damla-Gurbaz@users.noreply.github.com> --- ...ssifier_sdoh_community_absent_status_en.md | 124 ++++++++++++ ...sifier_sdoh_community_present_status_en.md | 125 +++++++++++++ ...e_classifier_sdoh_environment_status_en.md | 128 +++++++++++++ .../2022-12-18-drug_category_mapper_en.md | 158 ++++++++++++++++ .../2022-12-18-meddroprof_scielowiki_es.md | 177 ++++++++++++++++++ .../2022-12-18-ner_sdoh_mentions_en.md | 157 ++++++++++++++++ 6 files changed, 869 insertions(+) create mode 100644 docs/_posts/Damla-Gurbaz/2022-12-18-bert_sequence_classifier_sdoh_community_absent_status_en.md create mode 100644 docs/_posts/Damla-Gurbaz/2022-12-18-bert_sequence_classifier_sdoh_community_present_status_en.md create mode 100644 docs/_posts/Damla-Gurbaz/2022-12-18-bert_sequence_classifier_sdoh_environment_status_en.md create mode 100644 docs/_posts/Damla-Gurbaz/2022-12-18-drug_category_mapper_en.md create mode 100644 docs/_posts/Damla-Gurbaz/2022-12-18-meddroprof_scielowiki_es.md create mode 100644 docs/_posts/Damla-Gurbaz/2022-12-18-ner_sdoh_mentions_en.md diff --git a/docs/_posts/Damla-Gurbaz/2022-12-18-bert_sequence_classifier_sdoh_community_absent_status_en.md b/docs/_posts/Damla-Gurbaz/2022-12-18-bert_sequence_classifier_sdoh_community_absent_status_en.md new file mode 100644 index 00000000000000..2e3fcbac444756 --- /dev/null +++ b/docs/_posts/Damla-Gurbaz/2022-12-18-bert_sequence_classifier_sdoh_community_absent_status_en.md @@ -0,0 +1,124 @@ +--- +layout: model +title: SDOH Community Absent Binary Classification +author: John Snow Labs +name: bert_sequence_classifier_sdoh_community_absent_status 
+date: 2022-12-18 +tags: [en, licensed, clinical, sequence_classification, classifier, community_absent, sdoh] +task: Text Classification +language: en +edition: Healthcare NLP 4.2.2 +spark_version: 3.0 +supported: true +annotator: MedicalBertForSequenceClassification +engine: tensorflow +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model classifies clinical documents according to whether they mention the loss of social support, such as the loss of a family member or friend. A discharge summary was classified True for Community-Absent if the discharge summary had passages related to the loss of social support and False if such passages were not found in the discharge summary. + +## Predicted Entities + +`True`, `False` + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/bert_sequence_classifier_sdoh_community_absent_status_en_4.2.2_3.0_1671370818272.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +tokenizer = Tokenizer()\ + .setInputCols("document")\ + .setOutputCol("token") + +sequenceClassifier = MedicalBertForSequenceClassification.pretrained("bert_sequence_classifier_sdoh_community_absent_status", "en", "clinical/models")\ + .setInputCols(["document","token"])\ + .setOutputCol("class") + +pipeline = Pipeline(stages=[ + document_assembler, + tokenizer, + sequenceClassifier +]) +sample_texts =["She has two adult sons. She is a widow. She was employed with housework. She quit smoking 20 to 30 years ago, but smoked two packs per day for 20 to 30 years. She drinks one glass of wine occasionally. She avoids salt in her diet. ", + "65 year old male presented with several days of vice like chest pain. He states that he felt like his chest was being crushed from back to the front. Lives with spouse and two sons moved to US 1 month ago."] + +data = spark.createDataFrame(sample_texts, StringType()).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val sequenceClassifier = MedicalBertForSequenceClassification.pretrained("bert_sequence_classifier_sdoh_community_absent_status", "en", "clinical/models") + .setInputCols(Array("document","token")) + .setOutputCol("class") + +val pipeline = new Pipeline().setStages(Array(document_assembler, + tokenizer, + sequenceClassifier)) + +val data = Seq("She has two adult sons. She is a widow. She was employed with housework. She quit smoking 20 to 30 years ago, but smoked two packs per day for 20 to 30 years. She drinks one glass of wine occasionally. She avoids salt in her diet.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash ++----------------------------------------------------------------------------------------------------+-------+ +| text| result| ++----------------------------------------------------------------------------------------------------+-------+ +|She has two adult sons. She is a widow. She was employed with housework. She quit smoking 20 to 3...| [True]| +|65 year old male presented with several days of vice like chest pain. He states that he felt like...|[False]| ++----------------------------------------------------------------------------------------------------+-------+ +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_sequence_classifier_sdoh_community_absent_status| +|Compatibility:|Healthcare NLP 4.2.2+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[document, token]| +|Output Labels:|[class]| +|Language:|en| +|Size:|410.9 MB| +|Case sensitive:|true| +|Max sentence length:|512| + +## Benchmarking + +```bash + label precision recall f1-score support + False 0.89 0.77 0.83 155 + True 0.63 0.80 0.70 74 + accuracy - - 0.78 229 + macro-avg 0.76 0.79 0.76 229 + weighted-avg 0.80 0.78 0.79 229 +``` diff --git a/docs/_posts/Damla-Gurbaz/2022-12-18-bert_sequence_classifier_sdoh_community_present_status_en.md b/docs/_posts/Damla-Gurbaz/2022-12-18-bert_sequence_classifier_sdoh_community_present_status_en.md new file mode 100644 index 00000000000000..09a31c6940399b --- /dev/null +++ b/docs/_posts/Damla-Gurbaz/2022-12-18-bert_sequence_classifier_sdoh_community_present_status_en.md @@ -0,0 +1,125 @@ +--- +layout: model +title: SDOH Community Present Binary Classification +author: John Snow Labs +name: bert_sequence_classifier_sdoh_community_present_status +date: 2022-12-18 +tags: [en, licensed, clinical, sequence_classification, classifier, community_present, sdoh] +task: Text Classification +language: en +edition: Healthcare NLP 4.2.2 +spark_version: 3.0 +supported: true +annotator: 
MedicalBertForSequenceClassification +engine: tensorflow +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model classifies clinical documents according to whether they mention active social support, such as a family member or friend. A discharge summary was classified True for Community-Present if the discharge summary had passages related to active social support and False if such passages were not found in the discharge summary. + +## Predicted Entities + +`True`, `False` + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/bert_sequence_classifier_sdoh_community_present_status_en_4.2.2_3.0_1671371389301.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +tokenizer = Tokenizer()\ + .setInputCols("document")\ + .setOutputCol("token") + +sequenceClassifier = MedicalBertForSequenceClassification.pretrained("bert_sequence_classifier_sdoh_community_present_status", "en", "clinical/models")\ + .setInputCols(["document","token"])\ + .setOutputCol("class") + +pipeline = Pipeline(stages=[ + document_assembler, + tokenizer, + sequenceClassifier +]) + +sample_texts = ["Right inguinal hernia repair in childhood Cervical discectomy 3 years ago Umbilical hernia repair 2137. Retired schoolteacher, now substitutes. Lives with wife in location 1439. Has a 27 yo son and a 25 yo daughter. Name (NI) past or present smoking hx, no EtOH.", + "Atrial Septal Defect with Right Atrial Thrombus Pulmonary Hypertension Obesity, Obstructive Sleep Apnea. Denies tobacco and ETOH. Works as cafeteria worker."] + +data = spark.createDataFrame(sample_texts, StringType()).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val sequenceClassifier = MedicalBertForSequenceClassification.pretrained("bert_sequence_classifier_sdoh_community_present_status", "en", "clinical/models") + .setInputCols(Array("document","token")) + .setOutputCol("class") + +val pipeline = new Pipeline().setStages(Array(document_assembler, + tokenizer, + sequenceClassifier)) + +val data = Seq("Atrial Septal Defect with Right Atrial Thrombus Pulmonary Hypertension Obesity, Obstructive Sleep Apnea. Denies tobacco and ETOH. Works as cafeteria worker.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash ++----------------------------------------------------------------------------------------------------+-------+ +| text| result| ++----------------------------------------------------------------------------------------------------+-------+ +|Right inguinal hernia repair in childhood Cervical discectomy 3 years ago Umbilical hernia repair...| [True]| +|Atrial Septal Defect with Right Atrial Thrombus Pulmonary Hypertension Obesity, Obstructive Sleep...|[False]| ++----------------------------------------------------------------------------------------------------+-------+ +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_sequence_classifier_sdoh_community_present_status| +|Compatibility:|Healthcare NLP 4.2.2+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[document, token]| +|Output Labels:|[class]| +|Language:|en| +|Size:|410.9 MB| +|Case sensitive:|true| +|Max sentence length:|512| + +## Benchmarking + +```bash + label precision recall f1-score support + False 0.95 0.68 0.80 203 + True 0.85 0.98 0.91 359 + accuracy - - 0.87 562 + macro-avg 0.90 0.83 0.85 562 + weighted-avg 0.88 0.87 0.87 562 +``` diff --git a/docs/_posts/Damla-Gurbaz/2022-12-18-bert_sequence_classifier_sdoh_environment_status_en.md b/docs/_posts/Damla-Gurbaz/2022-12-18-bert_sequence_classifier_sdoh_environment_status_en.md new file mode 100644 index 00000000000000..54a81a3d080297 --- /dev/null +++ b/docs/_posts/Damla-Gurbaz/2022-12-18-bert_sequence_classifier_sdoh_environment_status_en.md @@ -0,0 +1,128 @@ +--- +layout: model +title: SDOH Environment Status Classification +author: John Snow Labs +name: bert_sequence_classifier_sdoh_environment_status +date: 2022-12-18 +tags: [en, clinical, sdoh, licensed, sequence_classification, environment_status, classifier] +task: Text Classification +language: en +edition: Healthcare NLP 4.2.2 +spark_version: 3.0 +supported: true +annotator: MedicalBertForSequenceClassification 
+engine: tensorflow +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model classifies clinical documents according to the patient's environment (housing) status. A discharge summary was classified as True for the SDOH Environment if there was any indication of housing, False if the patient was homeless and None if there was no related passage. + +## Predicted Entities + +`True`, `False`, `None` + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/bert_sequence_classifier_sdoh_environment_status_en_4.2.2_3.0_1671371837321.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +tokenizer = Tokenizer()\ + .setInputCols("document")\ + .setOutputCol("token") + +sequenceClassifier = MedicalBertForSequenceClassification.pretrained("bert_sequence_classifier_sdoh_environment_status", "en", "clinical/models")\ + .setInputCols(["document","token"])\ + .setOutputCol("class") + +pipeline = Pipeline(stages=[ + document_assembler, + tokenizer, + sequenceClassifier +]) + +sample_texts = ["The patient is a 29-year-old female with a history of renal transplant in 2097, who had worsening renal failure for the past several months. Her chief complaints were hypotension and seizure. months prior to admission and had been more hypertensive recently, requiring blood pressure medications. She was noted to have worsening renal function secondary to recent preeclampsia and her blood pressure control was thought to be secondary to renal failure.", + "Mr Known lastname 19017 is a 66 year-old man with a PMHx of stage 4 COPD (FEV1 0.65L;FEV1/FVC 37% predicted in 4-14) on 4L home o2 with numerous hospitalizations for COPD exacerbations and intubation, hypertension, coronary artery disease, GERD who presents with SOB and CP. He is admitted to the ICU for management of dyspnea and hypotension.", + "He was deemed Child's B in 2156-5-17 with ongoing ethanol abuse, admitted to Intensive Care Unit due to acute decompensation of chronic liver disease due to alcoholic hepatitis and Escherichia coli sepsis. after being hit in the head with the a bottle and dropping to the floor in the apartment. They had Trauma work him up including a head computerized tomography scan which was negative. He had abdominal pain for approximately one month with increasing abdominal girth, was noted to be febrile to 100 degrees on presentation and was tachycardiac 130, stable blood pressures. 
He was noted to have distended abdomen with diffuse tenderness computerized tomography scan of the abdomen which showed ascites and large nodule of the liver, splenomegaly, paraesophageal varices and loops of thickened bowel."] + +data = spark.createDataFrame(sample_texts, StringType()).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val sequenceClassifier = MedicalBertForSequenceClassification.pretrained("bert_sequence_classifier_sdoh_environment_status", "en", "clinical/models") + .setInputCols(Array("document","token")) + .setOutputCol("class") + +val pipeline = new Pipeline().setStages(Array(document_assembler, + tokenizer, + sequenceClassifier)) + +val data = Seq("The patient is a 29-year-old female with a history of renal transplant in 2097, who had worsening renal failure for the past several months. Her chief complaints were hypotension and seizure. months prior to admission and had been more hypertensive recently, requiring blood pressure medications. She was noted to have worsening renal function secondary to recent preeclampsia and her blood pressure control was thought to be secondary to renal failure.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash ++----------------------------------------------------------------------------------------------------+-------+ +| text| result| ++----------------------------------------------------------------------------------------------------+-------+ +|The patient is a 29-year-old female with a history of renal transplant in 2097, who had worsening...| [None]| +|Mr Known lastname 19017 is a 66 year-old man with a PMHx of stage 4 COPD (FEV1 0.65L;FEV1/FVC 37%...|[False]| +|He was deemed Child's B in 2156-5-17 with ongoing ethanol abuse, admitted to Intensive Care Unit ...| [True]| ++----------------------------------------------------------------------------------------------------+-------+ +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_sequence_classifier_sdoh_environment_status| +|Compatibility:|Healthcare NLP 4.2.2+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[document, token]| +|Output Labels:|[class]| +|Language:|en| +|Size:|410.9 MB| +|Case sensitive:|true| +|Max sentence length:|512| + +## Benchmarking + +```bash + label precision recall f1-score support + None 0.89 0.78 0.83 277 + False 0.86 0.93 0.90 419 + True 0.67 1.00 0.80 6 + accuracy - - 0.87 702 + macro-avg 0.81 0.90 0.84 702 + weighted-avg 0.87 0.87 0.87 702 +``` diff --git a/docs/_posts/Damla-Gurbaz/2022-12-18-drug_category_mapper_en.md b/docs/_posts/Damla-Gurbaz/2022-12-18-drug_category_mapper_en.md new file mode 100644 index 00000000000000..ce94a9ecca208f --- /dev/null +++ b/docs/_posts/Damla-Gurbaz/2022-12-18-drug_category_mapper_en.md @@ -0,0 +1,158 @@ +--- +layout: model +title: Mapping Drugs to Their Categories as well as Other Brand and Names +author: John Snow Labs +name: drug_category_mapper +date: 2022-12-18 +tags: [category, chunk_mapper, drug, licensed, clinical, en] +task: Chunk Mapping +language: en +edition: Healthcare NLP 4.2.2 +spark_version: 3.0 +supported: true +annotator: ChunkMapperModel 
+article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This pretrained model maps drugs to their categories and other brands and names. It has two categories called main category and subcategory. + +## Predicted Entities + +`main_category`, `sub_category`, `other_name` + +{:.btn-box} + +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/26.Chunk_Mapping.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/drug_category_mapper_en_4.2.2_3.0_1671374094037.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_detector = SentenceDetector()\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer()\ + .setInputCols("sentence")\ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token"])\ + .setOutputCol("embeddings") + +ner = MedicalNerModel.pretrained("ner_posology", "en", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +converter = NerConverter() \ + .setInputCols(["sentence", "token", "ner"]) \ + .setOutputCol("ner_chunk") + +chunkerMapper = ChunkMapperModel.pretrained("drug_category_mapper", "en", "clinical/models")\ + .setInputCols(["ner_chunk"])\ + .setOutputCol("mappings")\ + .setRels(["main_category", "sub_category", "other_name"]) + +pipeline = Pipeline().setStages([ + document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + converter, + chunkerMapper]) + +text= "She is given OxyContin, folic acid, levothyroxine, Norvasc, aspirin, Neurontin" + +data = spark.createDataFrame([[text]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = new SentenceDetector() + .setInputCols(Array("document")) + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val ner = MedicalNerModel.pretrained("ner_posology", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + 
.setOutputCol("ner") + +val converter = new NerConverter() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + +val chunkerMapper = ChunkMapperModel.pretrained("drug_category_mapper", "en", "clinical/models") + .setInputCols("ner_chunk") + .setOutputCol("mappings") + .setRels(Array("main_category", "sub_category", "other_name")) + +val pipeline = new Pipeline().setStages(Array( + document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + converter, + chunkerMapper)) + +val text= "She is given OxyContin, folic acid, levothyroxine, Norvasc, aspirin, Neurontin" + +val data = Seq(text).toDS.toDF("text") + +val result= pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash ++-------------+---------------------+-----------------------------------+-----------+ +| ner_chunk| main_category| sub_category|other_names| ++-------------+---------------------+-----------------------------------+-----------+ +| OxyContin| Pain Management| Opioid Analgesics| Oxaydo| +| folic acid| Nutritionals| Vitamins, Water-Soluble| Folvite| +|levothyroxine|Metabolic & Endocrine| Thyroid Products| Levo T| +| Norvasc| Cardiovascular| Antianginal Agents| Katerzia| +| aspirin| Cardiovascular|Antiplatelet Agents, Cardiovascular| ASA| +| Neurontin| Neurologics| GABA Analogs| Gralise| ++-------------+---------------------+-----------------------------------+-----------+ +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|drug_category_mapper| +|Compatibility:|Healthcare NLP 4.2.2+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[ner_chunk]| +|Output Labels:|[mappings]| +|Language:|en| +|Size:|526.0 KB| diff --git a/docs/_posts/Damla-Gurbaz/2022-12-18-meddroprof_scielowiki_es.md b/docs/_posts/Damla-Gurbaz/2022-12-18-meddroprof_scielowiki_es.md new file mode 100644 index 00000000000000..87b8ba27435af8 --- /dev/null +++ b/docs/_posts/Damla-Gurbaz/2022-12-18-meddroprof_scielowiki_es.md @@ -0,0 +1,177 @@ +--- +layout: model +title: Professions & Occupations NER model in Spanish (meddroprof_scielowiki) +author: John Snow Labs +name: meddroprof_scielowiki +date: 2022-12-18 +tags: [ner, licensed, prefessions, es, occupations] +task: Named Entity Recognition +language: es +edition: Healthcare NLP 4.2.2 +spark_version: 3.0 +supported: true +annotator: MedicalNerModel +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +NER model that detects professions and occupations in Spanish texts. Trained with the `embeddings_scielowiki_300d` embeddings, and the same `WordEmbeddingsModel` is needed in the pipeline. 
+ +## Predicted Entities + +`ACTIVIDAD`, `PROFESION`, `SITUACION_LABORAL` + +{:.btn-box} +[Live Demo](https://demo.johnsnowlabs.com/healthcare/NER_PROFESSIONS_ES/){:.button.button-orange} +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/healthcare/NER_PROFESSIONS_ES.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/meddroprof_scielowiki_es_4.2.2_3.0_1671367707210.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence = SentenceDetector() \ + .setInputCols("document") \ + .setOutputCol("sentence") + +tokenizer = Tokenizer() \ + .setInputCols("sentence") \ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel.pretrained("embeddings_scielowiki_300d", "es", "clinical/models")\ + .setInputCols(["document", "token"])\ + .setOutputCol("embeddings") + +clinical_ner = MedicalNerModel.pretrained("meddroprof_scielowiki", "es", "clinical/models")\ + .setInputCols(["sentence", "token", "embeddings"])\ + .setOutputCol("ner") + +ner_converter = NerConverter() \ + .setInputCols(["sentence", "token", "ner"]) \ + .setOutputCol("ner_chunk") + +pipeline = Pipeline(stages=[ + document_assembler, + sentence, + tokenizer, + word_embeddings, + clinical_ner, + ner_converter]) + +sample_text = """La paciente es la mayor de 2 hermanos, tiene un hermano de 13 años estudiando 1o ESO. 
Sus padres son ambos ATS , trabajan en diferentes centros de salud estudiando 1o ESO""" + +df = spark.createDataFrame([[sample_text]]).toDF("text") + +result = pipeline.fit(df).transform(df) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence = new SentenceDetector() + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_scielowiki_300d", "es", "clinical/models") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val clinical_ner = MedicalNerModel.pretrained("meddroprof_scielowiki", "es", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverter() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + +val pipeline = new Pipeline().setStages(Array(document_assembler, + sentence, + tokenizer, + word_embeddings, + clinical_ner, + ner_converter)) + +val data = Seq("""La paciente es la mayor de 2 hermanos, tiene un hermano de 13 años estudiando 1o ESO. Sus padres son ambos ATS , trabajan en diferentes centros de salud estudiando 1o ESO""").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash ++---------------------------------------+-----------------+ +|chunk |ner_label | ++---------------------------------------+-----------------+ +|estudiando 1o ESO |SITUACION_LABORAL| +|ATS |PROFESION | +|trabajan en diferentes centros de salud|PROFESION | +|estudiando 1o ESO |SITUACION_LABORAL| ++---------------------------------------+-----------------+ +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|meddroprof_scielowiki| +|Compatibility:|Healthcare NLP 4.2.2+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|es| +|Size:|14.8 MB| + +## References + +The model was trained with the [MEDDOPROF](https://temu.bsc.es/meddoprof/data/) data set: + + +> The MEDDOPROF corpus is a collection of 1844 clinical cases from over 20 different specialties annotated with professions and employment statuses. The corpus was annotated by a team composed of linguists and clinical experts following specially prepared annotation guidelines, after several cycles of quality control and annotation consistency analysis before annotating the entire dataset. Figure 1 shows a screenshot of a sample manual annotation generated using the brat annotation tool. 
+ +Reference: + + +``` +@article{meddoprof, + title={NLP applied to occupational health: MEDDOPROF shared task at IberLEF 2022 on automatic recognition, classification and normalization of professions and occupations from medical texts}, + author={Lima-López, Salvador and Farré-Maduell, Eulàlia and Miranda-Escalada, Antonio and Brivá-Iglesias, Vicent and Krallinger, Martin}, +journal = {Procesamiento del Lenguaje Natural}, +volume = {67}, + year={2022} +} +``` + +## Benchmarking + +```bash +label precision recall f1-score support +B-ACTIVIDAD 0.82 0.36 0.50 25 +B-PROFESION 0.87 0.75 0.81 634 +B-SITUACION_LABORAL 0.79 0.67 0.72 310 +I-ACTIVIDAD 0.86 0.43 0.57 58 +I-PROFESION 0.87 0.80 0.83 944 +I-SITUACION_LABORAL 0.74 0.71 0.73 407 +O 1.00 1.00 1.00 139880 +accuracy - - 0.99 142258 +macro-avg 0.85 0.67 0.74 142258 +weighted-avg 0.99 0.99 0.99 142258 +``` diff --git a/docs/_posts/Damla-Gurbaz/2022-12-18-ner_sdoh_mentions_en.md b/docs/_posts/Damla-Gurbaz/2022-12-18-ner_sdoh_mentions_en.md new file mode 100644 index 00000000000000..882262daf14194 --- /dev/null +++ b/docs/_posts/Damla-Gurbaz/2022-12-18-ner_sdoh_mentions_en.md @@ -0,0 +1,157 @@ +--- +layout: model +title: Detect Social Determinants of Health Mentions +author: John Snow Labs +name: ner_sdoh_mentions +date: 2022-12-18 +tags: [en, licensed, ner, sdoh, mentions, clinical] +task: Named Entity Recognition +language: en +edition: Healthcare NLP 4.2.2 +spark_version: 3.0 +supported: true +annotator: MedicalNerModel +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This Named Entity Recognition model is intended for detecting Social Determinants of Health mentions in clinical notes and trained by using MedicalNerApproach annotator that allows to train generic NER models based on Neural Networks. 
+ +## Predicted Entities + +`sdoh_community`, `sdoh_economics`, `sdoh_education`, `sdoh_environment`, `behavior_tobacco`, `behavior_alcohol`, `behavior_drug` + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_sdoh_mentions_en_4.2.2_3.0_1671369830893.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document")\ + +sentenceDetector = SentenceDetectorDLModel.pretrained("sentence_detector_dl","xx")\ + .setInputCols("document")\ + .setOutputCol("sentence") + +tokenizer = Tokenizer()\ + .setInputCols(["sentence"])\ + .setOutputCol("token") + +embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols("sentence", "token")\ + .setOutputCol("embeddings") + +ner_model = MedicalNerModel.pretrained("ner_sdoh_mentions", "en", "clinical/models")\ + .setInputCols(["sentence", "token", "embeddings"])\ + .setOutputCol("ner") + +ner_converter = NerConverter()\ + .setInputCols(["sentence", "token", "ner"])\ + .setOutputCol("ner_chunk") + +nlpPipeline = Pipeline(stages=[ + document_assembler, + sentenceDetector, + tokenizer, + embeddings, + ner_model, + ner_converter]) + +df = spark.createDataFrame([["Mr. Known lastname 9880 is a pleasant, cooperative gentleman with a long standing history (20 years) diverticulitis. He is married and has 3 children. He works in a bank. He denies any alcohol or intravenous drug use. 
He has been smoking for many years."]]).toDF("text") + +result = nlpPipeline.fit(df).transform(df) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentenceDetector = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx") + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val ner_model = MedicalNerModel.pretrained("ner_sdoh_mentions", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverter() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + +val nlpPipeline = new Pipeline().setStages(Array(document_assembler, + sentenceDetector, + tokenizer, + embeddings, + ner_model, + ner_converter)) + +val data = Seq("Mr. Known lastname 9880 is a pleasant, cooperative gentleman with a long standing history (20 years) diverticulitis. He is married and has 3 children. He works in a bank. He denies any alcohol or intravenous drug use. He has been smoking for many years.").toDS.toDF("text") + +val result = nlpPipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash ++----------------+----------------+ +|chunk |ner_label | ++----------------+----------------+ +|married |sdoh_community | +|children |sdoh_community | +|works |sdoh_economics | +|alcohol |behavior_alcohol| +|intravenous drug|behavior_drug | +|smoking |behavior_tobacco| ++----------------+----------------+ +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_sdoh_mentions| +|Compatibility:|Healthcare NLP 4.2.2+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|en| +|Size:|15.1 MB| + +## Benchmarking + +```bash + label precision recall f1-score support +behavior_alcohol 0.95 0.94 0.94 798 + behavior_drug 0.93 0.92 0.92 366 +behavior_tobacco 0.95 0.95 0.95 936 + sdoh_community 0.97 0.97 0.97 969 + sdoh_economics 0.95 0.91 0.93 363 + sdoh_education 0.69 0.65 0.67 34 +sdoh_environment 0.93 0.90 0.92 651 + micro avg 0.95 0.94 0.94 4117 + macro avg 0.91 0.89 0.90 4117 + weighted avg 0.95 0.94 0.94 4117 +``` From 038f9d59ab13be7d3d7aec74c1e884ad846dde66 Mon Sep 17 00:00:00 2001 From: jsl-models <74001263+jsl-models@users.noreply.github.com> Date: Mon, 19 Dec 2022 06:02:01 +0700 Subject: [PATCH 45/57] Add model 2022-12-18-ner_sdoh_mentions_test_en (#13259) Co-authored-by: Cabir40 --- .../2022-12-18-ner_sdoh_mentions_test_en.md | 145 ++++++++++++++++++ 1 file changed, 145 insertions(+) create mode 100644 docs/_posts/Cabir40/2022-12-18-ner_sdoh_mentions_test_en.md diff --git a/docs/_posts/Cabir40/2022-12-18-ner_sdoh_mentions_test_en.md b/docs/_posts/Cabir40/2022-12-18-ner_sdoh_mentions_test_en.md new file mode 100644 index 00000000000000..84c2901adebcd8 --- /dev/null +++ b/docs/_posts/Cabir40/2022-12-18-ner_sdoh_mentions_test_en.md @@ -0,0 +1,145 @@ +--- +layout: model +title: Detect Social Determinants of Health Mentions +author: John Snow Labs +name: ner_sdoh_mentions_test +date: 2022-12-18 +tags: [en, licence, test, ner, sdoh, 
mentions, clinical, licensed] +task: Named Entity Recognition +language: en +edition: Healthcare NLP 4.2.3 +spark_version: 3.0 +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This Named Entity Recognition model is intended for detecting Social Determinants of Health mentions in clinical notes. It was trained using the MedicalNerApproach annotator, which allows training generic NER models based on neural networks. + +## Predicted Entities + +`sdoh_community`, `sdoh_economics`, `sdoh_education`, `sdoh_environment`, `behavior_tobacco`, `behavior_alcohol`, `behavior_drug` + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_sdoh_mentions_test_en_4.2.3_3.0_1671404339484.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document")\ + +sentenceDetector = SentenceDetectorDLModel.pretrained("sentence_detector_dl","xx")\ + .setInputCols("document")\ + .setOutputCol("sentence") + +tokenizer = Tokenizer()\ + .setInputCols(["sentence"])\ + .setOutputCol("token") + +embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols("sentence", "token")\ + .setOutputCol("embeddings") + +ner_model = MedicalNerModel.pretrained("ner_sdoh_mentions_test", "en", "clinical/models")\ + .setInputCols(["sentence", "token", "embeddings"])\ + .setOutputCol("ner") +ner_converter = NerConverter()\ + .setInputCols(["sentence", "token", "ner"])\ + .setOutputCol("ner_chunk") + +nlpPipeline = Pipeline(stages=[ + document_assembler, + sentenceDetector, + tokenizer, + embeddings, + ner_model, + ner_converter]) + +df = spark.createDataFrame([["Mr. Known lastname 9880 is a pleasant, cooperative gentleman with a long standing history (20 years) diverticulitis. He is married and has 3 children. He works in a bank. He denies any alcohol or intravenous drug use. 
He has been smoking for many years."]]).toDF("text") +result = nlpPipeline.fit(df).transform(df) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") +val sentenceDetector = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx") + .setInputCols("document") + .setOutputCol("sentence") +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") +val embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") +val ner_model = MedicalNerModel.pretrained("ner_sdoh_mentions_test", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") +val ner_converter = new NerConverter() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") +val nlpPipeline = new Pipeline().setStages(Array(document_assembler, + sentenceDetector, + tokenizer, + embeddings, + ner_model, + ner_converter)) +val data = Seq("Mr. Known lastname 9880 is a pleasant, cooperative gentleman with a long standing history (20 years) diverticulitis. He is married and has 3 children. He works in a bank. He denies any alcohol or intravenous drug use. He has been smoking for many years.").toDS.toDF("text") +val result = nlpPipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash ++----------------+----------------+ +|chunk |ner_label | ++----------------+----------------+ +|married |sdoh_community | +|children |sdoh_community | +|works |sdoh_economics | +|alcohol |behavior_alcohol| +|intravenous drug|behavior_drug | +|smoking |behavior_tobacco| ++----------------+----------------+ +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_sdoh_mentions_test| +|Compatibility:|Healthcare NLP 4.2.3+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|en| +|Size:|15.1 MB| + +## Benchmarking + +```bash + label precision recall f1-score support +behavior_alcohol 0.95 0.94 0.94 798 + behavior_drug 0.93 0.92 0.92 366 +behavior_tobacco 0.95 0.95 0.95 936 + sdoh_community 0.97 0.97 0.97 969 + sdoh_economics 0.95 0.91 0.93 363 + sdoh_education 0.69 0.65 0.67 34 +sdoh_environment 0.93 0.90 0.92 651 + micro-avg 0.95 0.94 0.94 4117 + macro-avg 0.91 0.89 0.90 4117 + weighted-avg 0.95 0.94 0.94 4117 +``` \ No newline at end of file From d3d8b7c0ed98266e9abb3edde050a977a7c553f2 Mon Sep 17 00:00:00 2001 From: Cabir C <64752006+Cabir40@users.noreply.github.com> Date: Mon, 19 Dec 2022 17:20:12 +0300 Subject: [PATCH 46/57] Delete 2022-12-15-drug_category_mapper_en.md --- .../2022-12-15-drug_category_mapper_en.md | 159 ------------------ 1 file changed, 159 deletions(-) delete mode 100644 docs/_posts/Ahmetemintek/2022-12-15-drug_category_mapper_en.md diff --git a/docs/_posts/Ahmetemintek/2022-12-15-drug_category_mapper_en.md b/docs/_posts/Ahmetemintek/2022-12-15-drug_category_mapper_en.md deleted file mode 100644 index 09741907942e0c..00000000000000 --- a/docs/_posts/Ahmetemintek/2022-12-15-drug_category_mapper_en.md +++ /dev/null @@ -1,159 +0,0 @@ ---- -layout: model -title: Mapping Drugs to Their Categories as well as Other Brand and Names -author: John Snow Labs -name: drug_category_mapper -date: 2022-12-15 -tags: [category, 
chunk_mapper, drug, licensed, clinical, en] -task: Chunk Mapping -language: en -edition: Healthcare NLP 4.2.3 -spark_version: 3.0 -supported: true -article_header: - type: cover -use_language_switcher: "Python-Scala-Java" ---- - -## Description - -This pretrained model maps drugs to their categories and other brands and names. It has two categories called main category and subcategory. - -## Predicted Entities - -`main_category`, `sub_category`, `other_name` - -{:.btn-box} - -[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/26.Chunk_Mapping.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/drug_category_mapper_en_4.2.3_3.0_1671100791411.zip){:.button.button-orange.button-orange-trans.arr.button-icon} - -## How to use - - - -
-{% include programmingLanguageSelectScalaPythonNLU.html %} -```python -document_assembler = DocumentAssembler()\ - .setInputCol("text")\ - .setOutputCol("document") - -sentence_detector = SentenceDetector()\ - .setInputCols(["document"])\ - .setOutputCol("sentence") - -tokenizer = Tokenizer()\ - .setInputCols("sentence")\ - .setOutputCol("token") - -word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\ - .setInputCols(["sentence", "token"])\ - .setOutputCol("embeddings") - -ner = MedicalNerModel.pretrained("ner_posology", "en", "clinical/models") \ - .setInputCols(["sentence", "token", "embeddings"]) \ - .setOutputCol("ner") - -converter = NerConverter() \ - .setInputCols(["sentence", "token", "ner"]) \ - .setOutputCol("ner_chunk")\ - -chunkerMapper = ChunkMapperModel.pretrained("drug_category_mapper", "en", "clinical/models")\ - .setInputCols(["ner_chunk"])\ - .setOutputCol("mappings")\ - .setRels(["main_category", "sub_category", "other_name"])\ - -pipeline = Pipeline().setStages([ - document_assembler, - sentence_detector, - tokenizer, - word_embeddings, - ner, - converter, - chunkerMapper]) - - -text= "She is given OxyContin, folic acid, levothyroxine, Norvasc, aspirin, Neurontin" - -data = spark.createDataFrame([[text]]).toDF("text") - -result = pipeline.fit(data).transform(data) -``` -```scala -val document_assembler = new DocumentAssembler() - .setInputCol("text") - .setOutputCol("document") - -val sentence_detector = new SentenceDetector() - .setInputCols(Array("document")) - .setOutputCol("sentence") - -val tokenizer = new Tokenizer() - .setInputCols("sentence") - .setOutputCol("token") - -val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") - .setInputCols(Array("sentence", "token")) - .setOutputCol("embeddings") - -val ner = MedicalNerModel.pretrained("ner_posology", "en", "clinical/models") - .setInputCols(Array("sentence", "token", "embeddings")) - 
.setOutputCol("ner") - -val converter = new NerConverter() - .setInputCols(Array("sentence", "token", "ner")) - .setOutputCol("ner_chunk") - -val chunkerMapper = ChunkMapperModel.pretrained("drug_category_mapper", "en", "clinical/models") - .setInputCols("ner_chunk") - .setOutputCol("mappings") - .setRels(Array(["main_category", "sub_category", "other_name"])) - - -val pipeline = new Pipeline().setStages(Array( - document_assembler, - sentence_detector, - tokenizer, - word_embeddings, - ner, - converter, - chunkerMapper)) - - -val text= "She is given OxyContin, folic acid, levothyroxine, Norvasc, aspirin, Neurontin" - -val data = Seq(text).toDS.toDF("text") - -val result= pipeline.fit(data).transform(data) -``` -
- -## Results - -```bash -+-------------+---------------------+-----------------------------------+-----------+ -| ner_chunk| main_category| sub_category|other_names| -+-------------+---------------------+-----------------------------------+-----------+ -| OxyContin| Pain Management| Opioid Analgesics| Oxaydo| -| folic acid| Nutritionals| Vitamins, Water-Soluble| Folvite| -|levothyroxine|Metabolic & Endocrine| Thyroid Products| Levo T| -| Norvasc| Cardiovascular| Antianginal Agents| Katerzia| -| aspirin| Cardiovascular|Antiplatelet Agents, Cardiovascular| ASA| -| Neurontin| Neurologics| GABA Analogs| Gralise| -+-------------+---------------------+-----------------------------------+-----------+ -``` - -{:.model-param} -## Model Information - -{:.table-model} -|---|---| -|Model Name:|drug_category_mapper| -|Compatibility:|Healthcare NLP 4.2.3+| -|License:|Licensed| -|Edition:|Official| -|Input Labels:|[ner_chunk]| -|Output Labels:|[mappings]| -|Language:|en| -|Size:|526.0 KB| \ No newline at end of file From 3e83552364c690b2f5a5ff54caabea1d9e6d68a1 Mon Sep 17 00:00:00 2001 From: Cabir C <64752006+Cabir40@users.noreply.github.com> Date: Mon, 19 Dec 2022 17:21:08 +0300 Subject: [PATCH 47/57] Delete 2022-12-17-ner_sdoh_mentions_en.md --- .../2022-12-17-ner_sdoh_mentions_en.md | 157 ------------------ 1 file changed, 157 deletions(-) delete mode 100644 docs/_posts/Damla-Gurbaz/2022-12-17-ner_sdoh_mentions_en.md diff --git a/docs/_posts/Damla-Gurbaz/2022-12-17-ner_sdoh_mentions_en.md b/docs/_posts/Damla-Gurbaz/2022-12-17-ner_sdoh_mentions_en.md deleted file mode 100644 index 4e7c5d2262a91f..00000000000000 --- a/docs/_posts/Damla-Gurbaz/2022-12-17-ner_sdoh_mentions_en.md +++ /dev/null @@ -1,157 +0,0 @@ ---- -layout: model -title: Detect Social Determinants of Health Mentions -author: John Snow Labs -name: ner_sdoh_mentions -date: 2022-12-17 -tags: [en, licensed, ner, sdoh, mentions, clinical] -task: Named Entity Recognition -language: en -edition: Healthcare NLP 4.2.3 
-spark_version: 3.0 -supported: true -annotator: MedicalNerModel -article_header: - type: cover -use_language_switcher: "Python-Scala-Java" ---- - -## Description - -This Named Entity Recognition model is intended for detecting Social Determinants of Health mentions in clinical notes and trained by using MedicalNerApproach annotator that allows to train generic NER models based on Neural Networks. - -## Predicted Entities - -`sdoh_community`, `sdoh_economics`, `sdoh_education`, `sdoh_environment`, `behavior_tobacco`, `behavior_alcohol`, `behavior_drug` - -{:.btn-box} - - -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_sdoh_mentions_en_4.2.3_3.0_1671267131454.zip){:.button.button-orange.button-orange-trans.arr.button-icon} - -## How to use - - - -
-{% include programmingLanguageSelectScalaPythonNLU.html %} - -```python -document_assembler = DocumentAssembler()\ - .setInputCol("text")\ - .setOutputCol("document")\ - -sentenceDetector = SentenceDetectorDLModel.pretrained("sentence_detector_dl","xx")\ - .setInputCols("document")\ - .setOutputCol("sentence") - -tokenizer = Tokenizer()\ - .setInputCols(["sentence"])\ - .setOutputCol("token") - -embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\ - .setInputCols("sentence", "token")\ - .setOutputCol("embeddings") - -ner_model = MedicalNerModel.pretrained("ner_sdoh_mentions", "en", "clinical/models")\ - .setInputCols(["sentence", "token", "embeddings"])\ - .setOutputCol("ner") - -ner_converter = NerConverter()\ - .setInputCols(["sentence", "token", "ner"])\ - .setOutputCol("ner_chunk") - -nlpPipeline = Pipeline(stages=[ - document_assembler, - sentenceDetector, - tokenizer, - embeddings, - ner_model, - ner_converter]) - -df = spark.createDataFrame([["Mr. Known lastname 9880 is a pleasant, cooperative gentleman with a long standing history (20 years) diverticulitis. He is married and has 3 children. He works in a bank. He denies any alcohol or intravenous drug use. 
He has been smoking for many years."]]).toDF("text") - -result = nlpPipeline.fit(df).transform(df) -``` -```scala -val document_assembler = new DocumentAssembler() - .setInputCol("text") - .setOutputCol("document") - -val sentenceDetector = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx") - .setInputCols("document") - .setOutputCol("sentence") - -val tokenizer = new Tokenizer() - .setInputCols("sentence") - .setOutputCol("token") - -val embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") - .setInputCols(Array("sentence", "token")) - .setOutputCol("embeddings") - -val ner_model = MedicalNerModel.pretrained("ner_sdoh_mentions", "en", "clinical/models") - .setInputCols(Array("sentence", "token", "embeddings")) - .setOutputCol("ner") - -val ner_converter = new NerConverter() - .setInputCols(Array("sentence", "token", "ner")) - .setOutputCol("ner_chunk") - -val nlpPipeline = new PipelineModel().setStages(Array(document_assembler, - sentenceDetector, - tokenizer, - embeddings, - ner_model, - ner_converter)) - -val data = Seq("Mr. Known lastname 9880 is a pleasant, cooperative gentleman with a long standing history (20 years) diverticulitis. He is married and has 3 children. He works in a bank. He denies any alcohol or intravenous drug use. He has been smoking for many years.").toDS.toDF("text") - -val result = nlpPipeline.fit(data).transform(data) -``` -
- -## Results - -```bash -+----------------+----------------+ -|chunk |ner_label | -+----------------+----------------+ -|married |sdoh_community | -|children |sdoh_community | -|works |sdoh_economics | -|alcohol |behavior_alcohol| -|intravenous drug|behavior_drug | -|smoking |behavior_tobacco| -+----------------+----------------+ -``` - -{:.model-param} -## Model Information - -{:.table-model} -|---|---| -|Model Name:|ner_sdoh_mentions| -|Compatibility:|Healthcare NLP 4.2.3+| -|License:|Licensed| -|Edition:|Official| -|Input Labels:|[sentence, token, embeddings]| -|Output Labels:|[ner]| -|Language:|en| -|Size:|15.1 MB| - -## Benchmarking - -```bash - label precision recall f1-score support -behavior_alcohol 0.95 0.94 0.94 798 - behavior_drug 0.93 0.92 0.92 366 -behavior_tobacco 0.95 0.95 0.95 936 - sdoh_community 0.97 0.97 0.97 969 - sdoh_economics 0.95 0.91 0.93 363 - sdoh_education 0.69 0.65 0.67 34 -sdoh_environment 0.93 0.90 0.92 651 - micro-avg 0.95 0.94 0.94 4117 - macro-avg 0.91 0.89 0.90 4117 - weighted-avg 0.95 0.94 0.94 4117 -``` From c5701b5fadecacca73130e4530260daa395a4c35 Mon Sep 17 00:00:00 2001 From: jsl-models <74001263+jsl-models@users.noreply.github.com> Date: Fri, 6 Jan 2023 23:14:11 +0700 Subject: [PATCH 48/57] Add model 2023-01-06-redl_clinical_biobert_en (#13313) --- .../2023-01-06-redl_clinical_biobert_en.md | 223 ++++++++++++++++++ 1 file changed, 223 insertions(+) create mode 100644 docs/_posts/Cabir40/2023-01-06-redl_clinical_biobert_en.md diff --git a/docs/_posts/Cabir40/2023-01-06-redl_clinical_biobert_en.md b/docs/_posts/Cabir40/2023-01-06-redl_clinical_biobert_en.md new file mode 100644 index 00000000000000..69ee5d58ac5aec --- /dev/null +++ b/docs/_posts/Cabir40/2023-01-06-redl_clinical_biobert_en.md @@ -0,0 +1,223 @@ +--- +layout: model +title: Extract relations between problem, treatment and test entities (ReDL) +author: John Snow Labs +name: redl_clinical_biobert +date: 2023-01-06 +tags: [licensed, clinical, en, 
relation_extraction, tensorflow] +task: Relation Extraction +language: en +edition: Healthcare NLP 4.2.4 +spark_version: 3.0 +supported: true +engine: tensorflow +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Extract relations like `TrIP` : a certain treatment has improved a medical problem and 7 other such relations between problem, treatment and test entities. + +## Predicted Entities + +`PIP`, `TeCP`, `TeRP`, `TrAP`, `TrCP`, `TrIP`, `TrNAP`, `TrWP` + +{:.btn-box} +[Live Demo](https://demo.johnsnowlabs.com/healthcare/RE_CLINICAL/){:.button.button-orange} +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/10.Clinical_Relation_Extraction.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/redl_clinical_biobert_en_4.2.4_3.0_1673020165617.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documenter = DocumentAssembler()\ +.setInputCol("text")\ +.setOutputCol("document") + +sentencer = SentenceDetector()\ +.setInputCols(["document"])\ +.setOutputCol("sentences") + +tokenizer = sparknlp.annotators.Tokenizer()\ +.setInputCols(["sentences"])\ +.setOutputCol("tokens") + +pos_tagger = PerceptronModel()\ +.pretrained("pos_clinical", "en", "clinical/models") \ +.setInputCols(["sentences", "tokens"])\ +.setOutputCol("pos_tags") + +words_embedder = WordEmbeddingsModel() \ +.pretrained("embeddings_clinical", "en", "clinical/models") \ +.setInputCols(["sentences", "tokens"]) \ +.setOutputCol("embeddings") + +ner_tagger = MedicalNerModel() \ +.pretrained("ner_clinical", "en", "clinical/models") \ +.setInputCols(["sentences", "tokens", "embeddings"]) \ +.setOutputCol("ner_tags") + +ner_converter = NerConverter() \ +.setInputCols(["sentences", "tokens", "ner_tags"]) \ +.setOutputCol("ner_chunks") + +dependency_parser = DependencyParserModel() \ +.pretrained("dependency_conllu", "en") \ +.setInputCols(["sentences", "pos_tags", "tokens"]) \ +.setOutputCol("dependencies") + +# Set a filter on pairs of named entities which will be treated as relation candidates +re_ner_chunk_filter = RENerChunksFilter() \ +.setInputCols(["ner_chunks", "dependencies"])\ +.setMaxSyntacticDistance(10)\ +.setOutputCol("re_ner_chunks")\ +.setRelationPairs(["problem-test", "problem-treatment"]) + +# The dataset this model is trained to is sentence-wise. +# This model can also be trained on document-level relations - in which case, while predicting, use "document" instead of "sentence" as input. 
+re_model = RelationExtractionDLModel()\ +.pretrained('redl_clinical_biobert', 'en', "clinical/models") \ +.setPredictionThreshold(0.5)\ +.setInputCols(["re_ner_chunks", "sentences"]) \ +.setOutputCol("relations") + +pipeline = Pipeline(stages=[documenter, sentencer, tokenizer, pos_tagger, words_embedder, ner_tagger, ner_converter, dependency_parser, re_ner_chunk_filter, re_model]) + +text ="""A 28-year-old female with a history of gestational diabetes mellitus diagnosed eight years prior to presentation and subsequent type two diabetes mellitus ( T2DM ), +one prior episode of HTG-induced pancreatitis three years prior to presentation, associated with an acute hepatitis , and obesity with a body mass index ( BMI ) of 33.5 kg/m2 , presented with a one-week history of polyuria , polydipsia , poor appetite , and vomiting . Two weeks prior to presentation , she was treated with a five-day course of amoxicillin for a respiratory tract infection . She was on metformin , glipizide , and dapagliflozin for T2DM and atorvastatin and gemfibrozil for HTG . She had been on dapagliflozin for six months at the time of presentation. Physical examination on presentation was significant for dry oral mucosa ; significantly , her abdominal examination was benign with no tenderness , guarding , or rigidity . Pertinent laboratory findings on admission were : serum glucose 111 mg/dl , bicarbonate 18 mmol/l , anion gap 20 , creatinine 0.4 mg/dL , triglycerides 508 mg/dL , total cholesterol 122 mg/dL , glycated hemoglobin ( HbA1c ) 10% , and venous pH 7.27 . Serum lipase was normal at 43 U/L . Serum acetone levels could not be assessed as blood samples kept hemolyzing due to significant lipemia . The patient was initially admitted for starvation ketosis , as she reported poor oral intake for three days prior to admission . 
However , serum chemistry obtained six hours after presentation revealed her glucose was 186 mg/dL , the anion gap was still elevated at 21 , serum bicarbonate was 16 mmol/L , triglyceride level peaked at 2050 mg/dL , and lipase was 52 U/L . The β-hydroxybutyrate level was obtained and found to be elevated at 5.29 mmol/L - the original sample was centrifuged and the chylomicron layer removed prior to analysis due to interference from turbidity caused by lipemia again . The patient was treated with an insulin drip for euDKA and HTG with a reduction in the anion gap to 13 and triglycerides to 1400 mg/dL , within 24 hours . Her euDKA was thought to be precipitated by her respiratory tract infection in the setting of SGLT2 inhibitor use . The patient was seen by the endocrinology service and she was discharged on 40 units of insulin glargine at night , 12 units of insulin lispro with meals , and metformin 1000 mg two times a day . It was determined that all SGLT2 inhibitors should be discontinued indefinitely . +She had close follow-up with endocrinology post discharge . 
+""" + +data = spark.createDataFrame([[text]]).toDF("text") + +p_model = pipeline.fit(data) + +result = p_model.transform(data) +``` +```scala +val documenter = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val sentencer = new SentenceDetector() +.setInputCols(Array("document")) +.setOutputCol("sentences") + +val tokenizer = new Tokenizer() +.setInputCols(Array("sentences")) +.setOutputCol("tokens") + +val pos_tagger = PerceptronModel +.pretrained("pos_clinical", "en", "clinical/models") +.setInputCols(Array("sentences", "tokens")) +.setOutputCol("pos_tags") + +val words_embedder = WordEmbeddingsModel +.pretrained("embeddings_clinical", "en", "clinical/models") +.setInputCols(Array("sentences", "tokens")) +.setOutputCol("embeddings") + +val ner_tagger = MedicalNerModel.pretrained("ner_clinical", "en", "clinical/models") +.setInputCols(Array("sentences", "tokens", "embeddings")) +.setOutputCol("ner_tags") + +val ner_converter = new NerConverter() +.setInputCols(Array("sentences", "tokens", "ner_tags")) +.setOutputCol("ner_chunks") + +val dependency_parser = DependencyParserModel +.pretrained("dependency_conllu", "en") +.setInputCols(Array("sentences", "pos_tags", "tokens")) +.setOutputCol("dependencies") + +// Set a filter on pairs of named entities which will be treated as relation candidates +val re_ner_chunk_filter = new RENerChunksFilter() +.setInputCols(Array("ner_chunks", "dependencies")) +.setMaxSyntacticDistance(10) +.setOutputCol("re_ner_chunks") +.setRelationPairs(Array("problem-test", "problem-treatment")) + +// The dataset this model is trained to is sentence-wise. +// This model can also be trained on document-level relations - in which case, while predicting, use "document" instead of "sentence" as input. 
+val re_model = RelationExtractionDLModel +.pretrained("redl_clinical_biobert", "en", "clinical/models") +.setPredictionThreshold(0.5) +.setInputCols(Array("re_ner_chunks", "sentences")) +.setOutputCol("relations") + +val pipeline = new Pipeline().setStages(Array(documenter, sentencer, tokenizer, pos_tagger, words_embedder, ner_tagger, ner_converter, dependency_parser, re_ner_chunk_filter, re_model)) + +val data = Seq("""A 28-year-old female with a history of gestational diabetes mellitus diagnosed eight years prior to presentation and subsequent type two diabetes mellitus ( T2DM ), one prior episode of HTG-induced pancreatitis three years prior to presentation, associated with an acute hepatitis , and obesity with a body mass index ( BMI ) of 33.5 kg/m2 , presented with a one-week history of polyuria , polydipsia , poor appetite , and vomiting . Two weeks prior to presentation , she was treated with a five-day course of amoxicillin for a respiratory tract infection . She was on metformin , glipizide , and dapagliflozin for T2DM and atorvastatin and gemfibrozil for HTG . She had been on dapagliflozin for six months at the time of presentation. Physical examination on presentation was significant for dry oral mucosa ; significantly , her abdominal examination was benign with no tenderness , guarding , or rigidity . Pertinent laboratory findings on admission were : serum glucose 111 mg/dl , bicarbonate 18 mmol/l , anion gap 20 , creatinine 0.4 mg/dL , triglycerides 508 mg/dL , total cholesterol 122 mg/dL , glycated hemoglobin ( HbA1c ) 10% , and venous pH 7.27 . Serum lipase was normal at 43 U/L . Serum acetone levels could not be assessed as blood samples kept hemolyzing due to significant lipemia . The patient was initially admitted for starvation ketosis , as she reported poor oral intake for three days prior to admission . 
However , serum chemistry obtained six hours after presentation revealed her glucose was 186 mg/dL , the anion gap was still elevated at 21 , serum bicarbonate was 16 mmol/L , triglyceride level peaked at 2050 mg/dL , and lipase was 52 U/L . The β-hydroxybutyrate level was obtained and found to be elevated at 5.29 mmol/L - the original sample was centrifuged and the chylomicron layer removed prior to analysis due to interference from turbidity caused by lipemia again . The patient was treated with an insulin drip for euDKA and HTG with a reduction in the anion gap to 13 and triglycerides to 1400 mg/dL , within 24 hours . Her euDKA was thought to be precipitated by her respiratory tract infection in the setting of SGLT2 inhibitor use . The patient was seen by the endocrinology service and she was discharged on 40 units of insulin glargine at night , 12 units of insulin lispro with meals , and metformin 1000 mg two times a day . It was determined that all SGLT2 inhibitors should be discontinued indefinitely . She had close follow-up with endocrinology post discharge.""").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.relation.clinical").predict("""A 28-year-old female with a history of gestational diabetes mellitus diagnosed eight years prior to presentation and subsequent type two diabetes mellitus ( T2DM ), +one prior episode of HTG-induced pancreatitis three years prior to presentation, associated with an acute hepatitis , and obesity with a body mass index ( BMI ) of 33.5 kg/m2 , presented with a one-week history of polyuria , polydipsia , poor appetite , and vomiting . Two weeks prior to presentation , she was treated with a five-day course of amoxicillin for a respiratory tract infection . She was on metformin , glipizide , and dapagliflozin for T2DM and atorvastatin and gemfibrozil for HTG . She had been on dapagliflozin for six months at the time of presentation. 
Physical examination on presentation was significant for dry oral mucosa ; significantly , her abdominal examination was benign with no tenderness , guarding , or rigidity . Pertinent laboratory findings on admission were : serum glucose 111 mg/dl , bicarbonate 18 mmol/l , anion gap 20 , creatinine 0.4 mg/dL , triglycerides 508 mg/dL , total cholesterol 122 mg/dL , glycated hemoglobin ( HbA1c ) 10% , and venous pH 7.27 . Serum lipase was normal at 43 U/L . Serum acetone levels could not be assessed as blood samples kept hemolyzing due to significant lipemia . The patient was initially admitted for starvation ketosis , as she reported poor oral intake for three days prior to admission . However , serum chemistry obtained six hours after presentation revealed her glucose was 186 mg/dL , the anion gap was still elevated at 21 , serum bicarbonate was 16 mmol/L , triglyceride level peaked at 2050 mg/dL , and lipase was 52 U/L . The β-hydroxybutyrate level was obtained and found to be elevated at 5.29 mmol/L - the original sample was centrifuged and the chylomicron layer removed prior to analysis due to interference from turbidity caused by lipemia again . The patient was treated with an insulin drip for euDKA and HTG with a reduction in the anion gap to 13 and triglycerides to 1400 mg/dL , within 24 hours . Her euDKA was thought to be precipitated by her respiratory tract infection in the setting of SGLT2 inhibitor use . The patient was seen by the endocrinology service and she was discharged on 40 units of insulin glargine at night , 12 units of insulin lispro with meals , and metformin 1000 mg two times a day . It was determined that all SGLT2 inhibitors should be discontinued indefinitely . +She had close follow-up with endocrinology post discharge . +""") +``` +
+ +## Results + +```bash ++--------+---------+-------------+-----------+--------------------+---------+-------------+-----------+--------------------+----------+ +|relation| entity1|entity1_begin|entity1_end| chunk1| entity2|entity2_begin|entity2_end| chunk2|confidence| ++--------+---------+-------------+-----------+--------------------+---------+-------------+-----------+--------------------+----------+ +| TrAP|TREATMENT| 512| 522| amoxicillin| PROBLEM| 528| 556|a respiratory tra...|0.99796957| +| TrAP|TREATMENT| 571| 579| metformin| PROBLEM| 617| 620| T2DM|0.99757993| +| TrAP|TREATMENT| 599| 611| dapagliflozin| PROBLEM| 659| 661| HTG| 0.996036| +| TrAP| PROBLEM| 617| 620| T2DM|TREATMENT| 626| 637| atorvastatin| 0.9693424| +| TrAP| PROBLEM| 617| 620| T2DM|TREATMENT| 643| 653| gemfibrozil|0.99460286| +| TeRP| TEST| 739| 758|Physical examination| PROBLEM| 796| 810| dry oral mucosa|0.99775106| +| TeRP| TEST| 830| 854|her abdominal exa...| PROBLEM| 875| 884| tenderness|0.99272937| +| TeRP| TEST| 830| 854|her abdominal exa...| PROBLEM| 888| 895| guarding| 0.9840321| +| TeRP| TEST| 830| 854|her abdominal exa...| PROBLEM| 902| 909| rigidity| 0.9883966| +| TeRP| TEST| 1246| 1258| blood samples| PROBLEM| 1265| 1274| hemolyzing| 0.9534202| +| TeRP| TEST| 1507| 1517| her glucose| PROBLEM| 1553| 1566| still elevated| 0.9464761| +| TeRP| PROBLEM| 1553| 1566| still elevated| TEST| 1576| 1592| serum bicarbonate| 0.9428323| +| TeRP| PROBLEM| 1553| 1566| still elevated| TEST| 1656| 1661| lipase| 0.9558198| +| TeRP| PROBLEM| 1553| 1566| still elevated| TEST| 1670| 1672| U/L| 0.9214444| +| TeRP| TEST| 1676| 1702|The β-hydroxybuty...| PROBLEM| 1733| 1740| elevated| 0.9863963| +| TrAP|TREATMENT| 1937| 1951| an insulin drip| PROBLEM| 1957| 1961| euDKA| 0.9852455| +| O| PROBLEM| 1957| 1961| euDKA| TEST| 1991| 2003| the anion gap|0.94141793| +| O| PROBLEM| 1957| 1961| euDKA| TEST| 2015| 2027| triglycerides| 0.9622529| 
++--------+---------+-------------+-----------+--------------------+---------+-------------+-----------+--------------------+----------+ +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|redl_clinical_biobert| +|Compatibility:|Healthcare NLP 4.2.4+| +|License:|Licensed| +|Edition:|Official| +|Language:|en| +|Size:|402.0 MB| + +## Benchmarking + +```bash +Relation Recall Precision F1 Support +PIP 0.859 0.878 0.869 1435 +TeCP 0.629 0.782 0.697 337 +TeRP 0.903 0.929 0.916 2034 +TrAP 0.872 0.866 0.869 1693 +TrCP 0.641 0.677 0.659 340 +TrIP 0.517 0.796 0.627 151 +TrNAP 0.402 0.672 0.503 112 +TrWP 0.257 0.824 0.392 109 +Avg. 0.635 0.803 0.691 - +``` \ No newline at end of file From ee76bdb17048288a953e9ccf851a67135e17d909 Mon Sep 17 00:00:00 2001 From: jsl-models <74001263+jsl-models@users.noreply.github.com> Date: Thu, 12 Jan 2023 20:34:25 +0700 Subject: [PATCH 49/57] 2023-01-11-ner_oncology_unspecific_posology_healthcare_en (#13328) * Add model 2023-01-11-ner_oncology_unspecific_posology_healthcare_en * Update 2023-01-11-ner_oncology_unspecific_posology_healthcare_en.md * Add model 2023-01-11-ner_oncology_biomarker_healthcare_en * Add model 2023-01-11-ner_oncology_anatomy_general_healthcare_en Co-authored-by: Meryem1425 Co-authored-by: Vildan <64216738+Meryem1425@users.noreply.github.com> --- ..._oncology_anatomy_general_healthcare_en.md | 154 ++++++++++++++++ ...11-ner_oncology_biomarker_healthcare_en.md | 168 ++++++++++++++++++ ...ology_unspecific_posology_healthcare_en.md | 157 ++++++++++++++++ 3 files changed, 479 insertions(+) create mode 100644 docs/_posts/Meryem1425/2023-01-11-ner_oncology_anatomy_general_healthcare_en.md create mode 100644 docs/_posts/Meryem1425/2023-01-11-ner_oncology_biomarker_healthcare_en.md create mode 100644 docs/_posts/Meryem1425/2023-01-11-ner_oncology_unspecific_posology_healthcare_en.md diff --git a/docs/_posts/Meryem1425/2023-01-11-ner_oncology_anatomy_general_healthcare_en.md 
b/docs/_posts/Meryem1425/2023-01-11-ner_oncology_anatomy_general_healthcare_en.md new file mode 100644 index 00000000000000..56064df4c74b32 --- /dev/null +++ b/docs/_posts/Meryem1425/2023-01-11-ner_oncology_anatomy_general_healthcare_en.md @@ -0,0 +1,154 @@ +--- +layout: model +title: Extract Anatomical Entities from Oncology Texts +author: John Snow Labs +name: ner_oncology_anatomy_general_healthcare +date: 2023-01-11 +tags: [licensed, clinical, oncology, en, ner, anatomy] +task: Named Entity Recognition +language: en +edition: Healthcare NLP 4.2.4 +spark_version: 3.0 +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model extracts anatomical entities using an unspecific label. + +## Predicted Entities + +`Anatomical_Site`, `Direction` + +{:.btn-box} +[Live Demo](https://demo.johnsnowlabs.com/healthcare/NER_ONCOLOGY_CLINICAL/){:.button.button-orange} +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/27.Oncology_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_oncology_anatomy_general_healthcare_en_4.2.4_3.0_1673477824696.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_detector = SentenceDetectorDLModel\ + .pretrained("sentence_detector_dl_healthcare","en","clinical/models")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer() \ + .setInputCols(["sentence"]) \ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel()\ + .pretrained("embeddings_healthcare_100d", "en", "clinical/models")\ + .setInputCols(["sentence", "token"]) \ + .setOutputCol("embeddings") + +ner = MedicalNerModel\ + .pretrained("ner_oncology_anatomy_general_healthcare", "en", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +ner_converter = NerConverterInternal() \ + .setInputCols(["sentence", "token", "ner"]) \ + .setOutputCol("ner_chunk") + +pipeline = Pipeline(stages=[document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter]) + +data = spark.createDataFrame([["The patient presented a mass in her left breast, and a possible metastasis in her lungs and in her liver."]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = SentenceDetectorDLModel + .pretrained("sentence_detector_dl_healthcare","en","clinical/models") + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel + .pretrained("embeddings_healthcare_100d", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val ner = MedicalNerModel.pretrained("ner_oncology_anatomy_general_healthcare", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) 
+ .setOutputCol("ner") + +val ner_converter = new NerConverterInternal() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + +val pipeline = new Pipeline().setStages(Array(document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter)) + +val data = Seq("The patient presented a mass in her left breast, and a possible metastasis in her lungs and in her liver.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash +| chunk | ner_label | +|:--------|:----------------| +| left | Direction | +| breast | Anatomical_Site | +| lungs | Anatomical_Site | +| liver | Anatomical_Site | + +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_oncology_anatomy_general_healthcare| +|Compatibility:|Healthcare NLP 4.2.4+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|en| +|Size:|34.0 MB| + +## References + +In-house annotated oncology case reports. + +## Benchmarking + +```bash + label tp fp fn total precision recall f1 +Anatomical_Site 1439 235 333 1772 0.86 0.81 0.84 + Direction 434 92 65 499 0.83 0.87 0.85 + macro-avg 1873 327 398 2271 0.84 0.84 0.84 + micro-avg 1873 327 398 2271 0.85 0.82 0.84 +``` \ No newline at end of file diff --git a/docs/_posts/Meryem1425/2023-01-11-ner_oncology_biomarker_healthcare_en.md b/docs/_posts/Meryem1425/2023-01-11-ner_oncology_biomarker_healthcare_en.md new file mode 100644 index 00000000000000..d94ddcaca9b1b7 --- /dev/null +++ b/docs/_posts/Meryem1425/2023-01-11-ner_oncology_biomarker_healthcare_en.md @@ -0,0 +1,168 @@ +--- +layout: model +title: Extract Biomarkers and Their Results +author: John Snow Labs +name: ner_oncology_biomarker_healthcare +date: 2023-01-11 +tags: [licensed, clinical, oncology, en, ner, biomarker] +task: Named Entity Recognition +language: en +edition: Healthcare NLP 4.2.4 +spark_version: 3.0 +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model extracts mentions of biomarkers and biomarker results from oncology texts. 
+ +## Predicted Entities + +`Biomarker_Result`, `Biomarker` + +{:.btn-box} +[Live Demo](https://demo.johnsnowlabs.com/healthcare/NER_ONCOLOGY_CLINICAL/){:.button.button-orange} +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/27.Oncology_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_oncology_biomarker_healthcare_en_4.2.4_3.0_1673477151495.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_detector = SentenceDetectorDLModel\ + .pretrained("sentence_detector_dl_healthcare","en","clinical/models")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer() \ + .setInputCols(["sentence"]) \ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel()\ + .pretrained("embeddings_healthcare_100d", "en", "clinical/models")\ + .setInputCols(["sentence", "token"]) \ + .setOutputCol("embeddings") + +ner = MedicalNerModel\ + .pretrained("ner_oncology_biomarker_healthcare", "en", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +ner_converter = NerConverterInternal() \ + .setInputCols(["sentence", "token", "ner"]) \ + .setOutputCol("ner_chunk") + +pipeline = Pipeline(stages=[document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter]) + +data = spark.createDataFrame([["The results of immunohistochemical examination showed that she tested negative for CK7, synaptophysin (Syn), chromogranin A (CgA), Muc5AC, human epidermal growth factor receptor-2 (HER2), and Muc6; positive for CK20, Muc1, Muc2, E-cadherin, and p53; the Ki-67 index was about 87%."]]).toDF("text") + +result = pipeline.fit(data).transform(data) + +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = SentenceDetectorDLModel + .pretrained("sentence_detector_dl_healthcare","en","clinical/models") + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel + .pretrained("embeddings_healthcare_100d", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + 
.setOutputCol("embeddings") + +val ner = MedicalNerModel.pretrained("ner_oncology_biomarker_healthcare", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverterInternal() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + +val pipeline = new Pipeline().setStages(Array(document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter)) + +val data = Seq("The results of immunohistochemical examination showed that she tested negative for CK7, synaptophysin (Syn), chromogranin A (CgA), Muc5AC, human epidermal growth factor receptor-2 (HER2), and Muc6; positive for CK20, Muc1, Muc2, E-cadherin, and p53; the Ki-67 index was about 87%.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash +| chunk | ner_label | +|:-----------------------------------------|:-----------------| +| negative | Biomarker_Result | +| CK7 | Biomarker | +| synaptophysin | Biomarker | +| Syn | Biomarker | +| chromogranin A | Biomarker | +| CgA | Biomarker | +| Muc5AC | Biomarker | +| human epidermal growth factor receptor-2 | Biomarker | +| HER2 | Biomarker | +| Muc6 | Biomarker | +| positive | Biomarker_Result | +| CK20 | Biomarker | +| Muc1 | Biomarker | +| Muc2 | Biomarker | +| E-cadherin | Biomarker | +| p53 | Biomarker | +| Ki-67 index | Biomarker | +| 87% | Biomarker_Result | +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_oncology_biomarker_healthcare| +|Compatibility:|Healthcare NLP 4.2.4+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|en| +|Size:|33.8 MB| + +## References + +In-house annotated oncology case reports. + +## Benchmarking + +```bash + label tp fp fn total precision recall f1 +Biomarker_Result 519 78 62 581 0.87 0.89 0.88 + Biomarker 828 51 98 926 0.94 0.89 0.92 + macro-avg 1347 129 160 1507 0.91 0.89 0.90 + micro-avg 1347 129 160 1507 0.91 0.89 0.90 +``` \ No newline at end of file diff --git a/docs/_posts/Meryem1425/2023-01-11-ner_oncology_unspecific_posology_healthcare_en.md b/docs/_posts/Meryem1425/2023-01-11-ner_oncology_unspecific_posology_healthcare_en.md new file mode 100644 index 00000000000000..883ae01871ff4c --- /dev/null +++ b/docs/_posts/Meryem1425/2023-01-11-ner_oncology_unspecific_posology_healthcare_en.md @@ -0,0 +1,157 @@ +--- +layout: model +title: Extract Cancer Therapies and Posology Information +author: John Snow Labs +name: ner_oncology_unspecific_posology_healthcare +date: 2023-01-11 +tags: [licensed, clinical, oncology, en, ner, treatment, posology] +task: Named Entity Recognition +language: en +edition: Healthcare NLP 4.2.4 +spark_version: 3.0 +supported: true +article_header: + 
type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model extracts mentions of treatments and posology information using unspecific labels (low granularity). + +## Predicted Entities + +`Posology_Information`, `Cancer_Therapy` + +{:.btn-box} +[Live Demo](https://demo.johnsnowlabs.com/healthcare/NER_ONCOLOGY_CLINICAL/){:.button.button-orange} +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/27.Oncology_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_oncology_unspecific_posology_healthcare_en_4.2.4_3.0_1673475870938.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_detector = SentenceDetectorDLModel\ + .pretrained("sentence_detector_dl_healthcare","en","clinical/models")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer() \ + .setInputCols(["sentence"]) \ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel()\ + .pretrained("embeddings_healthcare_100d", "en", "clinical/models")\ + .setInputCols(["sentence", "token"]) \ + .setOutputCol("embeddings") + +ner = MedicalNerModel\ + .pretrained("ner_oncology_unspecific_posology_healthcare", "en", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +ner_converter = NerConverterInternal() \ + .setInputCols(["sentence", "token", "ner"]) \ + .setOutputCol("ner_chunk") + +pipeline = Pipeline(stages=[document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter]) + +data = spark.createDataFrame([["The patient underwent a regimen consisting of adriamycin (60 mg/m2) and cyclophosphamide (600 mg/m2) over six courses. 
She is currently receiving his second cycle of chemotherapy and is in good overall condition."]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = SentenceDetectorDLModel + .pretrained("sentence_detector_dl_healthcare","en","clinical/models") + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel + .pretrained("embeddings_healthcare_100d", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val ner = MedicalNerModel.pretrained("ner_oncology_unspecific_posology_healthcare", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverterInternal() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + +val pipeline = new Pipeline().setStages(Array(document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter)) + +val data = Seq("The patient underwent a regimen consisting of adriamycin (60 mg/m2) and cyclophosphamide (600 mg/m2) over six courses. She is currently receiving his second cycle of chemotherapy and is in good overall condition.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash +| chunk | ner_label | +|:-----------------|:---------------------| +| adriamycin | Cancer_Therapy | +| 60 mg/m2 | Posology_Information | +| cyclophosphamide | Cancer_Therapy | +| 600 mg/m2 | Posology_Information | +| over six courses | Posology_Information | +| second cycle | Posology_Information | +| chemotherapy | Cancer_Therapy | +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_oncology_unspecific_posology_healthcare| +|Compatibility:|Healthcare NLP 4.2.4+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|en| +|Size:|33.8 MB| + +## References + +In-house annotated oncology case reports. + +## Benchmarking + +```bash + label tp fp fn total precision recall f1 +Posology_Information 1435 102 210 1645 0.93 0.87 0.90 + Cancer_Therapy 1281 116 125 1406 0.92 0.91 0.91 + macro-avg 2716 218 335 3051 0.93 0.89 0.91 + micro-avg 2716 218 335 3051 0.93 0.89 0.91 +``` From bdf8d16adcf693dec8799708b8aa6cc71a6e45ff Mon Sep 17 00:00:00 2001 From: jsl-models <74001263+jsl-models@users.noreply.github.com> Date: Sat, 14 Jan 2023 21:43:26 +0700 Subject: [PATCH 50/57] 2023-01-14-genericclassifier_sdoh_tobacco_usage_sbiobert_cased_mli_en (#13348) --- ...ohol_usage_binary_sbiobert_cased_mli_en.md | 136 +++++++++++++++++ ...doh_alcohol_usage_sbiobert_cased_mli_en.md | 143 +++++++++++++++++ ..._economics_binary_sbiobert_cased_mli_en.md | 133 ++++++++++++++++ ...ance_usage_binary_sbiobert_cased_mli_en.md | 138 +++++++++++++++++ ...doh_tobacco_usage_sbiobert_cased_mli_en.md | 144 ++++++++++++++++++ 5 files changed, 694 insertions(+) create mode 100644 docs/_posts/Damla-Gurbaz/2023-01-14-genericclassifier_sdoh_alcohol_usage_binary_sbiobert_cased_mli_en.md create mode 100644 docs/_posts/Damla-Gurbaz/2023-01-14-genericclassifier_sdoh_alcohol_usage_sbiobert_cased_mli_en.md create mode 100644 
docs/_posts/Damla-Gurbaz/2023-01-14-genericclassifier_sdoh_economics_binary_sbiobert_cased_mli_en.md create mode 100644 docs/_posts/Damla-Gurbaz/2023-01-14-genericclassifier_sdoh_substance_usage_binary_sbiobert_cased_mli_en.md create mode 100644 docs/_posts/Damla-Gurbaz/2023-01-14-genericclassifier_sdoh_tobacco_usage_sbiobert_cased_mli_en.md diff --git a/docs/_posts/Damla-Gurbaz/2023-01-14-genericclassifier_sdoh_alcohol_usage_binary_sbiobert_cased_mli_en.md b/docs/_posts/Damla-Gurbaz/2023-01-14-genericclassifier_sdoh_alcohol_usage_binary_sbiobert_cased_mli_en.md new file mode 100644 index 00000000000000..bb26b188ccb726 --- /dev/null +++ b/docs/_posts/Damla-Gurbaz/2023-01-14-genericclassifier_sdoh_alcohol_usage_binary_sbiobert_cased_mli_en.md @@ -0,0 +1,136 @@ +--- +layout: model +title: SDOH Alcohol Usage For Binary Classification +author: John Snow Labs +name: genericclassifier_sdoh_alcohol_usage_binary_sbiobert_cased_mli +date: 2023-01-14 +tags: [en, licensed, generic_classifier, sdoh, alcohol, clinical] +task: Text Classification +language: en +edition: Healthcare NLP 4.2.4 +spark_version: 3.0 +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This Generic Classifier model is intended for detecting alcohol use in clinical notes and was trained using the GenericClassifierApproach annotator. `Present:` if the patient was a current consumer of alcohol or the patient was a consumer in the past and had quit. `Never:` if the patient had never consumed alcohol. `None:` if there was no related text. + +## Predicted Entities + +`Present`, `Never`, `None` + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/genericclassifier_sdoh_alcohol_usage_binary_sbiobert_cased_mli_en_4.2.4_3.0_1673699002618.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_embeddings = BertSentenceEmbeddings.pretrained("sbiobert_base_cased_mli", 'en','clinical/models')\ + .setInputCols(["document"])\ + .setOutputCol("sentence_embeddings") + +features_asm = FeaturesAssembler()\ + .setInputCols(["sentence_embeddings"])\ + .setOutputCol("features") + +generic_classifier = GenericClassifierModel.pretrained("genericclassifier_sdoh_alcohol_usage_binary_sbiobert_cased_mli", 'en', 'clinical/models')\ + .setInputCols(["features"])\ + .setOutputCol("class") + +pipeline = Pipeline(stages=[ + document_assembler, + sentence_embeddings, + features_asm, + generic_classifier +]) + +text_list = ["Retired schoolteacher, now substitutes. Lives with wife in location 1439. Has a 27 yo son and a 25 yo daughter. He uses alcohol and cigarettes", + "Employee in neuro departmentin at the Center Hospital 18. Widower since 2001. Current smoker since 20 years. No EtOH or illicits.", + "Patient smoked 4 ppd x 37 years, quitting 22 years ago. 
He is widowed, lives alone, has three children."] + +df = spark.createDataFrame(text_list, StringType()).toDF("text") + +result = pipeline.fit(df).transform(df) + +result.select("text", "class.result").show(truncate=100) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_embeddings = BertSentenceEmbeddings.pretrained("sbiobert_base_cased_mli", "en", "clinical/models") + .setInputCols("document") + .setOutputCol("sentence_embeddings") + +val features_asm = new FeaturesAssembler() + .setInputCols("sentence_embeddings") + .setOutputCol("features") + +val generic_classifier = GenericClassifierModel.pretrained("genericclassifier_sdoh_alcohol_usage_binary_sbiobert_cased_mli", "en", "clinical/models") + .setInputCols("features") + .setOutputCol("class") + +val pipeline = new Pipeline().setStages(Array( + document_assembler, + sentence_embeddings, + features_asm, + generic_classifier)) + +val data = Seq("Retired schoolteacher, now substitutes. Lives with wife in location 1439. Has a 27 yo son and a 25 yo daughter. He uses alcohol and cigarettes.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash ++----------------------------------------------------------------------------------------------------+---------+ +| text| result| ++----------------------------------------------------------------------------------------------------+---------+ +|Retired schoolteacher, now substitutes. Lives with wife in location 1439. Has a 27 yo son and a 2...|[Present]| +|Employee in neuro departmentin at the Center Hospital 18. Widower since 2001. Current smoker sinc...| [Never]| +|Patient smoked 4 ppd x 37 years, quitting 22 years ago. He is widowed, lives alone, has three chi...| [None]| ++----------------------------------------------------------------------------------------------------+---------+ +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|genericclassifier_sdoh_alcohol_usage_binary_sbiobert_cased_mli| +|Compatibility:|Healthcare NLP 4.2.4+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[features]| +|Output Labels:|[prediction]| +|Language:|en| +|Size:|3.4 MB| + +## Benchmarking + +```bash + label precision recall f1-score support + Never 0.85 0.86 0.85 523 + None 0.81 0.82 0.81 341 + Present 0.88 0.86 0.87 516 + accuracy - - 0.85 1380 + macro-avg 0.85 0.85 0.85 1380 +weighted-avg 0.85 0.85 0.85 1380 +``` \ No newline at end of file diff --git a/docs/_posts/Damla-Gurbaz/2023-01-14-genericclassifier_sdoh_alcohol_usage_sbiobert_cased_mli_en.md b/docs/_posts/Damla-Gurbaz/2023-01-14-genericclassifier_sdoh_alcohol_usage_sbiobert_cased_mli_en.md new file mode 100644 index 00000000000000..68b2790cb0287a --- /dev/null +++ b/docs/_posts/Damla-Gurbaz/2023-01-14-genericclassifier_sdoh_alcohol_usage_sbiobert_cased_mli_en.md @@ -0,0 +1,143 @@ +--- +layout: model +title: SDOH Alcohol Usage For Classification +author: John Snow Labs +name: genericclassifier_sdoh_alcohol_usage_sbiobert_cased_mli +date: 2023-01-14 +tags: [en, licensed, generic_classifier, sdoh, alcohol, clinical] +task: Text Classification 
+language: en +edition: Healthcare NLP 4.2.4 +spark_version: 3.0 +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This Generic Classifier model is intended for detecting alcohol use in clinical notes and was trained using the GenericClassifierApproach annotator. `Present:` if the patient was a current consumer of alcohol. `Past:` if the patient was a consumer in the past and had quit. `Never:` if the patient had never consumed alcohol. `None:` if there was no related text. + +## Predicted Entities + +`Present`, `Past`, `Never`, `None` + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/genericclassifier_sdoh_alcohol_usage_sbiobert_cased_mli_en_4.2.4_3.0_1673698550774.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_embeddings = BertSentenceEmbeddings.pretrained("sbiobert_base_cased_mli", 'en','clinical/models')\ + .setInputCols(["document"])\ + .setOutputCol("sentence_embeddings") + +features_asm = FeaturesAssembler()\ + .setInputCols(["sentence_embeddings"])\ + .setOutputCol("features") + +generic_classifier = GenericClassifierModel.pretrained("genericclassifier_sdoh_alcohol_usage_sbiobert_cased_mli", 'en', 'clinical/models')\ + .setInputCols(["features"])\ + .setOutputCol("class") + +pipeline = Pipeline(stages=[ + document_assembler, + sentence_embeddings, + features_asm, + generic_classifier +]) + +text_list = ["Retired schoolteacher, now substitutes. Lives with wife in location 1439. Has a 27 yo son and a 25 yo daughter. He uses alcohol and cigarettes", + "The patient quit smoking approximately two years ago with an approximately a 40 pack year history, mostly cigar use. He also reports 'heavy alcohol use', quit 15 months ago.", + "Employee in neuro departmentin at the Center Hospital 18. Widower since 2001. Current smoker since 20 years. No EtOH or illicits.", + "Patient smoked 4 ppd x 37 years, quitting 22 years ago. 
He is widowed, lives alone, has three children."] + +df = spark.createDataFrame(text_list, StringType()).toDF("text") + +result = pipeline.fit(df).transform(df) + +result.select("text", "class.result").show(truncate=100) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_embeddings = BertSentenceEmbeddings.pretrained("sbiobert_base_cased_mli", "en", "clinical/models") + .setInputCols("document") + .setOutputCol("sentence_embeddings") + +val features_asm = new FeaturesAssembler() + .setInputCols("sentence_embeddings") + .setOutputCol("features") + +val generic_classifier = GenericClassifierModel.pretrained("genericclassifier_sdoh_alcohol_usage_sbiobert_cased_mli", "en", "clinical/models") + .setInputCols("features") + .setOutputCol("class") + +val pipeline = new Pipeline().setStages(Array( + document_assembler, + sentence_embeddings, + features_asm, + generic_classifier)) + +val data = Seq("Retired schoolteacher, now substitutes. Lives with wife in location 1439. Has a 27 yo son and a 25 yo daughter. He uses alcohol and cigarettes.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash + ++----------------------------------------------------------------------------------------------------+---------+ +| text| result| ++----------------------------------------------------------------------------------------------------+---------+ +|Retired schoolteacher, now substitutes. Lives with wife in location 1439. Has a 27 yo son and a 2...|[Present]| +|The patient quit smoking approximately two years ago with an approximately a 40 pack year history...| [Past]| +|Employee in neuro departmentin at the Center Hospital 18. Widower since 2001. Current smoker sinc...| [Never]| +|Patient smoked 4 ppd x 37 years, quitting 22 years ago. He is widowed, lives alone, has three chi...| [None]| ++----------------------------------------------------------------------------------------------------+---------+ + +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|genericclassifier_sdoh_alcohol_usage_sbiobert_cased_mli| +|Compatibility:|Healthcare NLP 4.2.4+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[features]| +|Output Labels:|[prediction]| +|Language:|en| +|Size:|3.5 MB| + +## Benchmarking + +```bash + + label precision recall f1-score support + Never 0.84 0.87 0.85 523 + None 0.83 0.74 0.81 341 + Past 0.51 0.35 0.50 98 + Present 0.74 0.83 0.79 418 + accuracy - - 0.79 1380 + macro-avg 0.73 0.70 0.71 1380 +weighted-avg 0.78 0.79 0.78 1380 + +``` \ No newline at end of file diff --git a/docs/_posts/Damla-Gurbaz/2023-01-14-genericclassifier_sdoh_economics_binary_sbiobert_cased_mli_en.md b/docs/_posts/Damla-Gurbaz/2023-01-14-genericclassifier_sdoh_economics_binary_sbiobert_cased_mli_en.md new file mode 100644 index 00000000000000..ff05934d42593d --- /dev/null +++ b/docs/_posts/Damla-Gurbaz/2023-01-14-genericclassifier_sdoh_economics_binary_sbiobert_cased_mli_en.md @@ -0,0 +1,133 @@ +--- +layout: model +title: SDOH Economics Status For Binary Classification +author: John Snow Labs +name: 
genericclassifier_sdoh_economics_binary_sbiobert_cased_mli +date: 2023-01-14 +tags: [en, licensed, generic_classifier, sdoh, economics, clinical] +task: Text Classification +language: en +edition: Healthcare NLP 4.2.4 +spark_version: 3.0 +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model classifies text related to socioeconomic status in clinical documents and was trained using the GenericClassifierApproach annotator. `True:` if the patient was currently employed or unemployed. `False:` if there was no related passage. + +## Predicted Entities + +`True`, `False` + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/genericclassifier_sdoh_economics_binary_sbiobert_cased_mli_en_4.2.4_3.0_1673699299086.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_embeddings = BertSentenceEmbeddings.pretrained("sbiobert_base_cased_mli", 'en','clinical/models')\ + .setInputCols(["document"])\ + .setOutputCol("sentence_embeddings") + +features_asm = FeaturesAssembler()\ + .setInputCols(["sentence_embeddings"])\ + .setOutputCol("features") + +generic_classifier = GenericClassifierModel.pretrained("genericclassifier_sdoh_economics_binary_sbiobert_cased_mli", 'en', 'clinical/models')\ + .setInputCols(["features"])\ + .setOutputCol("class") + +pipeline = Pipeline(stages=[ + document_assembler, + sentence_embeddings, + features_asm, + generic_classifier +]) + +text_list = ["Retired schoolteacher, now substitutes. Lives with wife in location 1439. Has a 27 yo son and a 25 yo daughter. He uses alcohol and cigarettes", + "The patient quit smoking approximately two years ago with an approximately a 40 pack year history, mostly cigar use. 
He also reports 'heavy alcohol use', quit 15 months ago."] + +df = spark.createDataFrame(text_list, StringType()).toDF("text") + +result = pipeline.fit(df).transform(df) + +result.select("text", "class.result").show(truncate=100) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_embeddings = BertSentenceEmbeddings.pretrained("sbiobert_base_cased_mli", "en", "clinical/models") + .setInputCols("document") + .setOutputCol("sentence_embeddings") + +val features_asm = new FeaturesAssembler() + .setInputCols("sentence_embeddings") + .setOutputCol("features") + +val generic_classifier = GenericClassifierModel.pretrained("genericclassifier_sdoh_economics_binary_sbiobert_cased_mli", "en", "clinical/models") + .setInputCols("features") + .setOutputCol("class") + +val pipeline = new Pipeline().setStages(Array( + document_assembler, + sentence_embeddings, + features_asm, + generic_classifier)) + +val data = Seq("Retired schoolteacher, now substitutes. Lives with wife in location 1439. Has a 27 yo son and a 25 yo daughter. He uses alcohol and cigarettes.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash ++----------------------------------------------------------------------------------------------------+-------+ +| text| result| ++----------------------------------------------------------------------------------------------------+-------+ +|Retired schoolteacher, now substitutes. Lives with wife in location 1439. Has a 27 yo son and a 2...| [True]| +|The patient quit smoking approximately two years ago with an approximately a 40 pack year history...|[False]| ++----------------------------------------------------------------------------------------------------+-------+ +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|genericclassifier_sdoh_economics_binary_sbiobert_cased_mli| +|Compatibility:|Healthcare NLP 4.2.4+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[features]| +|Output Labels:|[prediction]| +|Language:|en| +|Size:|3.5 MB| + +## Benchmarking + +```bash + label precision recall f1-score support + False 0.93 0.85 0.89 894 + True 0.79 0.90 0.84 562 + accuracy - - 0.87 1456 + macro-avg 0.86 0.87 0.86 1456 +weighted-avg 0.87 0.87 0.87 1456 +``` \ No newline at end of file diff --git a/docs/_posts/Damla-Gurbaz/2023-01-14-genericclassifier_sdoh_substance_usage_binary_sbiobert_cased_mli_en.md b/docs/_posts/Damla-Gurbaz/2023-01-14-genericclassifier_sdoh_substance_usage_binary_sbiobert_cased_mli_en.md new file mode 100644 index 00000000000000..8b65f56676014f --- /dev/null +++ b/docs/_posts/Damla-Gurbaz/2023-01-14-genericclassifier_sdoh_substance_usage_binary_sbiobert_cased_mli_en.md @@ -0,0 +1,138 @@ +--- +layout: model +title: SDOH Substance Usage For Binary Classification +author: John Snow Labs +name: genericclassifier_sdoh_substance_usage_binary_sbiobert_cased_mli +date: 2023-01-14 +tags: [en, licensed, generic_classifier, sdoh, substance, clinical] +task: Text Classification +language: en +edition: Healthcare NLP 4.2.4 +spark_version: 3.0 +supported: true +article_header: + type: 
cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This Generic Classifier model is intended for detecting substance use in clinical notes and was trained using the GenericClassifierApproach annotator. `Present:` if the patient was a current consumer of substances or the patient was a consumer in the past and had quit or if the patient had never consumed substances. `None:` if there was no related text. + +## Predicted Entities + +`Present`, `None` + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/genericclassifier_sdoh_substance_usage_binary_sbiobert_cased_mli_en_4.2.4_3.0_1673697973649.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_embeddings = BertSentenceEmbeddings.pretrained("sbiobert_base_cased_mli", 'en','clinical/models')\ + .setInputCols(["document"])\ + .setOutputCol("sentence_embeddings") + +features_asm = FeaturesAssembler()\ + .setInputCols(["sentence_embeddings"])\ + .setOutputCol("features") + +generic_classifier = GenericClassifierModel.pretrained("genericclassifier_sdoh_substance_usage_binary_sbiobert_cased_mli", 'en', 'clinical/models')\ + .setInputCols(["features"])\ + .setOutputCol("class") + +pipeline = Pipeline(stages=[ + document_assembler, + sentence_embeddings, + features_asm, + generic_classifier +]) + +text_list = ["Lives in apartment with 16-year-old daughter. Denies EtOH use currently although reports occasional use in past. Utox on admission positive for opiate (on as rx) as well as cocaine. 4-6 cigarettes a day on and off for 10 years. Denies h/o illicit drug use besides marijuana although admitted to cocaine use after being found to have urine positive for cocaine.", + "The patient quit smoking approximately two years ago with an approximately a 40 pack year history, mostly cigar use. 
He also reports 'heavy alcohol use', quit 15 months ago."] + + +df = spark.createDataFrame(text_list, StringType()).toDF("text") + +result = pipeline.fit(df).transform(df) + +result.select("text", "class.result").show(truncate=100) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_embeddings = BertSentenceEmbeddings.pretrained("sbiobert_base_cased_mli", "en", "clinical/models") + .setInputCols("document") + .setOutputCol("sentence_embeddings") + +val features_asm = new FeaturesAssembler() + .setInputCols("sentence_embeddings") + .setOutputCol("features") + +val generic_classifier = GenericClassifierModel.pretrained("genericclassifier_sdoh_substance_usage_binary_sbiobert_cased_mli", "en", "clinical/models") + .setInputCols("features") + .setOutputCol("class") + +val pipeline = new Pipeline().setStages(Array( + document_assembler, + sentence_embeddings, + features_asm, + generic_classifier)) + +val data = Seq("The patient quit smoking approximately two years ago with an approximately a 40 pack year history, mostly cigar use. He also reports 'heavy alcohol use', quit 15 months ago.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash + ++----------------------------------------------------------------------------------------------------+---------+ +| text| result| ++----------------------------------------------------------------------------------------------------+---------+ +|Lives in apartment with 16-year-old daughter. Denies EtOH use currently although reports occasion...|[Present]| +|The patient quit smoking approximately two years ago with an approximately a 40 pack year history...| [None]| ++----------------------------------------------------------------------------------------------------+---------+ + +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|genericclassifier_sdoh_substance_usage_binary_sbiobert_cased_mli| +|Compatibility:|Healthcare NLP 4.2.4+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[features]| +|Output Labels:|[prediction]| +|Language:|en| +|Size:|3.4 MB| + +## Benchmarking + +```bash + + label precision recall f1-score support + None 0.91 0.83 0.87 898 + Present 0.76 0.87 0.81 540 + accuracy - - 0.85 1438 + macro-avg 0.83 0.85 0.84 1438 +weighted-avg 0.85 0.85 0.85 1438 + +``` \ No newline at end of file diff --git a/docs/_posts/Damla-Gurbaz/2023-01-14-genericclassifier_sdoh_tobacco_usage_sbiobert_cased_mli_en.md b/docs/_posts/Damla-Gurbaz/2023-01-14-genericclassifier_sdoh_tobacco_usage_sbiobert_cased_mli_en.md new file mode 100644 index 00000000000000..2cf543c976819d --- /dev/null +++ b/docs/_posts/Damla-Gurbaz/2023-01-14-genericclassifier_sdoh_tobacco_usage_sbiobert_cased_mli_en.md @@ -0,0 +1,144 @@ +--- +layout: model +title: SDOH Tobacco Usage For Classification +author: John Snow Labs +name: genericclassifier_sdoh_tobacco_usage_sbiobert_cased_mli +date: 2023-01-14 +tags: [en, licensed, generic_classifier, sdoh, tobacco, clinical] +task: Text Classification +language: en +edition: Healthcare NLP 4.2.4 +spark_version: 3.0 +supported: true +article_header: + type: cover 
+use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This Generic Classifier model is intended for detecting tobacco use in clinical notes and was trained using the GenericClassifierApproach annotator. `Present:` if the patient was a current consumer of tobacco. `Past:` if the patient was a consumer in the past and had quit. `Never:` if the patient had never consumed tobacco. `None:` if there was no related text. + +## Predicted Entities + +`Present`, `Past`, `Never`, `None` + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/genericclassifier_sdoh_tobacco_usage_sbiobert_cased_mli_en_4.2.4_3.0_1673697468673.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_embeddings = BertSentenceEmbeddings.pretrained("sbiobert_base_cased_mli", 'en','clinical/models')\ + .setInputCols(["document"])\ + .setOutputCol("sentence_embeddings") + +features_asm = FeaturesAssembler()\ + .setInputCols(["sentence_embeddings"])\ + .setOutputCol("features") + +generic_classifier = GenericClassifierModel.pretrained("genericclassifier_sdoh_tobacco_usage_sbiobert_cased_mli", 'en', 'clinical/models')\ + .setInputCols(["features"])\ + .setOutputCol("class") + +pipeline = Pipeline(stages=[ + document_assembler, + sentence_embeddings, + features_asm, + generic_classifier +]) + +text_list = ["Retired schoolteacher, now substitutes. Lives with wife in location 1439. Has a 27 yo son and a 25 yo daughter. He uses alcohol and cigarettes", + "The patient quit smoking approximately two years ago with an approximately a 40 pack year history, mostly cigar use. He also reports 'heavy alcohol use', quit 15 months ago.", + "The patient denies any history of smoking or alcohol abuse. She lives with her one daughter.", + "She was previously employed as a hairdresser, though says she hasnt worked in 4 years. 
Not reported by patient, but there is apparently a history of alochol abuse."] + +df = spark.createDataFrame(text_list, StringType()).toDF("text") + +result = pipeline.fit(df).transform(df) + +result.select("text", "class.result").show(truncate=100) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_embeddings = BertSentenceEmbeddings.pretrained("sbiobert_base_cased_mli", "en", "clinical/models") + .setInputCols("document") + .setOutputCol("sentence_embeddings") + +val features_asm = new FeaturesAssembler() + .setInputCols("sentence_embeddings") + .setOutputCol("features") + +val generic_classifier = GenericClassifierModel.pretrained("genericclassifier_sdoh_tobacco_usage_sbiobert_cased_mli", "en", "clinical/models") + .setInputCols("features") + .setOutputCol("class") + +val pipeline = new Pipeline().setStages(Array( + document_assembler, + sentence_embeddings, + features_asm, + generic_classifier)) + +val data = Seq("Retired schoolteacher, now substitutes. Lives with wife in location 1439. Has a 27 yo son and a 25 yo daughter. He uses alcohol and cigarettes.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash + ++----------------------------------------------------------------------------------------------------+---------+ +| text| result| ++----------------------------------------------------------------------------------------------------+---------+ +|Retired schoolteacher, now substitutes. Lives with wife in location 1439. Has a 27 yo son and a 2...|[Present]| +|The patient quit smoking approximately two years ago with an approximately a 40 pack year history...| [Past]| +| The patient denies any history of smoking or alcohol abuse. She lives with her one daughter.| [Never]| +|She was previously employed as a hairdresser, though says she hasnt worked in 4 years. Not report...| [None]| ++----------------------------------------------------------------------------------------------------+---------+ + +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|genericclassifier_sdoh_tobacco_usage_sbiobert_cased_mli| +|Compatibility:|Healthcare NLP 4.2.4+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[features]| +|Output Labels:|[prediction]| +|Language:|en| +|Size:|3.4 MB| + +## Benchmarking + +```bash + + label precision recall f1-score support + Never 0.89 0.90 0.90 487 + None 0.86 0.78 0.82 269 + Past 0.87 0.79 0.83 415 + Present 0.63 0.82 0.71 203 + accuracy - - 0.83 1374 + macro-avg 0.81 0.82 0.81 1374 +weighted-avg 0.84 0.83 0.83 1374 + +``` From 28e2d76a9b62ee06bef1de8b26b96625670662c1 Mon Sep 17 00:00:00 2001 From: jsl-models <74001263+jsl-models@users.noreply.github.com> Date: Sun, 15 Jan 2023 22:37:32 +0700 Subject: [PATCH 51/57] 2023-01-14-redl_ade_biobert_en (#13349) * Add model 2023-01-14-redl_ade_biobert_en * Add model 2023-01-14-redl_bodypart_direction_biobert_en * Add model 2023-01-14-redl_bodypart_problem_biobert_en * Update 2023-01-14-redl_ade_biobert_en.md * Update 2023-01-14-redl_bodypart_direction_biobert_en.md * Update 2023-01-14-redl_bodypart_problem_biobert_en.md * Add model 
2023-01-14-redl_bodypart_procedure_test_biobert_en * Add model 2023-01-14-redl_chemprot_biobert_en * Update 2023-01-14-redl_chemprot_biobert_en.md * Add model 2023-01-14-redl_clinical_biobert_en * Add model 2023-01-14-redl_date_clinical_biobert_en * Add model 2023-01-14-redl_drug_drug_interaction_biobert_en * Add model 2023-01-14-redl_drugprot_biobert_en * Add model 2023-01-14-redl_human_phenotype_gene_biobert_en * Add model 2023-01-15-redl_nihss_biobert_en * Add model 2023-01-15-redl_oncology_biobert_wip_en * Add model 2023-01-15-redl_oncology_biomarker_result_biobert_wip_en * Add model 2023-01-15-redl_oncology_granular_biobert_wip_en * Add model 2023-01-15-redl_oncology_location_biobert_wip_en * Add model 2023-01-15-redl_oncology_size_biobert_wip_en * Add model 2023-01-15-redl_oncology_temporal_biobert_wip_en * Update 2023-01-14-redl_bodypart_direction_biobert_en.md * Update 2023-01-14-redl_date_clinical_biobert_en.md * Update 2023-01-14-redl_date_clinical_biobert_en.md * Update 2023-01-14-redl_drug_drug_interaction_biobert_en.md * Update 2023-01-15-redl_oncology_biomarker_result_biobert_wip_en.md * Update 2023-01-15-redl_oncology_granular_biobert_wip_en.md * Update 2023-01-15-redl_oncology_size_biobert_wip_en.md * Update 2023-01-14-redl_bodypart_direction_biobert_en.md * Update 2023-01-14-redl_bodypart_problem_biobert_en.md * Update 2023-01-14-redl_bodypart_procedure_test_biobert_en.md * Update 2023-01-14-redl_chemprot_biobert_en.md * Update 2023-01-14-redl_clinical_biobert_en.md * Update 2023-01-15-redl_oncology_biobert_wip_en.md * Update 2023-01-15-redl_oncology_biomarker_result_biobert_wip_en.md * Update 2023-01-15-redl_oncology_granular_biobert_wip_en.md * Add model 2023-01-15-redl_oncology_test_result_biobert_wip_en * Add model 2023-01-15-redl_temporal_events_biobert_en Co-authored-by: Meryem1425 Co-authored-by: Vildan <64216738+Meryem1425@users.noreply.github.com> --- .../2023-01-14-redl_ade_biobert_en.md | 208 +++++++++++++++++ 
...1-14-redl_bodypart_direction_biobert_en.md | 201 ++++++++++++++++ ...-01-14-redl_bodypart_problem_biobert_en.md | 191 +++++++++++++++ ...redl_bodypart_procedure_test_biobert_en.md | 185 +++++++++++++++ .../2023-01-14-redl_chemprot_biobert_en.md | 211 +++++++++++++++++ .../2023-01-14-redl_clinical_biobert_en.md | 220 ++++++++++++++++++ ...023-01-14-redl_date_clinical_biobert_en.md | 193 +++++++++++++++ ...4-redl_drug_drug_interaction_biobert_en.md | 196 ++++++++++++++++ .../2023-01-14-redl_drugprot_biobert_en.md | 219 +++++++++++++++++ ...14-redl_human_phenotype_gene_biobert_en.md | 203 ++++++++++++++++ .../2023-01-15-redl_nihss_biobert_en.md | 182 +++++++++++++++ ...2023-01-15-redl_oncology_biobert_wip_en.md | 191 +++++++++++++++ ...ncology_biomarker_result_biobert_wip_en.md | 196 ++++++++++++++++ ...5-redl_oncology_granular_biobert_wip_en.md | 194 +++++++++++++++ ...5-redl_oncology_location_biobert_wip_en.md | 190 +++++++++++++++ ...01-15-redl_oncology_size_biobert_wip_en.md | 192 +++++++++++++++ ...5-redl_oncology_temporal_biobert_wip_en.md | 191 +++++++++++++++ ...edl_oncology_test_result_biobert_wip_en.md | 192 +++++++++++++++ ...3-01-15-redl_temporal_events_biobert_en.md | 197 ++++++++++++++++ 19 files changed, 3752 insertions(+) create mode 100644 docs/_posts/Meryem1425/2023-01-14-redl_ade_biobert_en.md create mode 100644 docs/_posts/Meryem1425/2023-01-14-redl_bodypart_direction_biobert_en.md create mode 100644 docs/_posts/Meryem1425/2023-01-14-redl_bodypart_problem_biobert_en.md create mode 100644 docs/_posts/Meryem1425/2023-01-14-redl_bodypart_procedure_test_biobert_en.md create mode 100644 docs/_posts/Meryem1425/2023-01-14-redl_chemprot_biobert_en.md create mode 100644 docs/_posts/Meryem1425/2023-01-14-redl_clinical_biobert_en.md create mode 100644 docs/_posts/Meryem1425/2023-01-14-redl_date_clinical_biobert_en.md create mode 100644 docs/_posts/Meryem1425/2023-01-14-redl_drug_drug_interaction_biobert_en.md create mode 100644 
docs/_posts/Meryem1425/2023-01-14-redl_drugprot_biobert_en.md create mode 100644 docs/_posts/Meryem1425/2023-01-14-redl_human_phenotype_gene_biobert_en.md create mode 100644 docs/_posts/Meryem1425/2023-01-15-redl_nihss_biobert_en.md create mode 100644 docs/_posts/Meryem1425/2023-01-15-redl_oncology_biobert_wip_en.md create mode 100644 docs/_posts/Meryem1425/2023-01-15-redl_oncology_biomarker_result_biobert_wip_en.md create mode 100644 docs/_posts/Meryem1425/2023-01-15-redl_oncology_granular_biobert_wip_en.md create mode 100644 docs/_posts/Meryem1425/2023-01-15-redl_oncology_location_biobert_wip_en.md create mode 100644 docs/_posts/Meryem1425/2023-01-15-redl_oncology_size_biobert_wip_en.md create mode 100644 docs/_posts/Meryem1425/2023-01-15-redl_oncology_temporal_biobert_wip_en.md create mode 100644 docs/_posts/Meryem1425/2023-01-15-redl_oncology_test_result_biobert_wip_en.md create mode 100644 docs/_posts/Meryem1425/2023-01-15-redl_temporal_events_biobert_en.md diff --git a/docs/_posts/Meryem1425/2023-01-14-redl_ade_biobert_en.md b/docs/_posts/Meryem1425/2023-01-14-redl_ade_biobert_en.md new file mode 100644 index 00000000000000..d363b8575d847e --- /dev/null +++ b/docs/_posts/Meryem1425/2023-01-14-redl_ade_biobert_en.md @@ -0,0 +1,208 @@ +--- +layout: model +title: Relation extraction between Drugs and ADE (ReDL) +author: John Snow Labs +name: redl_ade_biobert +date: 2023-01-14 +tags: [relation_extraction, en, clinical, licensed, ade, biobert, tensorflow] +task: Relation Extraction +language: en +edition: Healthcare NLP 4.2.4 +spark_version: 3.0 +supported: true +engine: tensorflow +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model is an end-to-end trained BioBERT model, capable of Relating Drugs and adverse reactions caused by them; It predicts if an adverse event is caused by a drug or not. 
1 : Shows the adverse event and drug entities are related, 0 : Shows the adverse event and drug entities are not related. + +## Predicted Entities + +`0`, `1` + +{:.btn-box} +[Live Demo](https://demo.johnsnowlabs.com/healthcare/RE_ADE/){:.button.button-orange} +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/healthcare/RE_ADE.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/redl_ade_biobert_en_4.2.4_3.0_1673708531142.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +documenter = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentencer = SentenceDetector()\ + .setInputCols(["document"])\ + .setOutputCol("sentences") + +tokenizer = Tokenizer()\ + .setInputCols(["sentences"])\ + .setOutputCol("tokens") + +words_embedder = WordEmbeddingsModel() \ + .pretrained("embeddings_clinical", "en", "clinical/models") \ + .setInputCols(["sentences", "tokens"]) \ + .setOutputCol("embeddings") + +ner_tagger = MedicalNerModel.pretrained("ner_ade_clinical", "en", "clinical/models")\ + .setInputCols("sentences", "tokens", "embeddings")\ + .setOutputCol("ner_tags") + +ner_converter = NerConverterInternal() \ + .setInputCols(["sentences", "tokens", "ner_tags"]) \ + .setOutputCol("ner_chunks") + +pos_tagger = PerceptronModel()\ + .pretrained("pos_clinical", "en", "clinical/models") \ + .setInputCols(["sentences", "tokens"])\ + .setOutputCol("pos_tags") + +dependency_parser = DependencyParserModel()\ + .pretrained("dependency_conllu", "en")\ + .setInputCols(["sentences", "pos_tags", "tokens"])\ + .setOutputCol("dependencies") + +# Set a filter on pairs of named entities which will be treated as relation candidates +re_ner_chunk_filter = RENerChunksFilter() \ + .setInputCols(["ner_chunks", "dependencies"])\ + .setMaxSyntacticDistance(10)\ + .setOutputCol("re_ner_chunks")\ + .setRelationPairs(['ade-drug', 'drug-ade']) + +# The dataset this model is trained to is sentence-wise. +# This model can also be trained on document-level relations - in which case, while predicting, use "document" instead of "sentence" as input. 
+re_model = RelationExtractionDLModel()\ + .pretrained('redl_ade_biobert', 'en', "clinical/models") \ + .setPredictionThreshold(0.5)\ + .setInputCols(["re_ner_chunks", "sentences"]) \ + .setOutputCol("relations") + +pipeline = Pipeline(stages=[documenter, + sentencer, + tokenizer, + pos_tagger, + words_embedder, + ner_tagger, + ner_converter, + dependency_parser, + re_ner_chunk_filter, + re_model]) + +light_pipeline = LightPipeline(pipeline.fit(spark.createDataFrame([[""]]).toDF("text"))) + +text ="""Been taking Lipitor for 15 years , have experienced severe fatigue a lot. The doctor moved me to voltarene 2 months ago, so far I have only had muscle cramps.""" + +annotations = light_pipeline.fullAnnotate(text) +``` +```scala +val documenter = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentencer = new SentenceDetector() + .setInputCols("document") + .setOutputCol("sentences") + +val tokenizer = new Tokenizer() + .setInputCols("sentences") + .setOutputCol("tokens") + +val words_embedder = WordEmbeddingsModel() + .pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentences", "tokens")) + .setOutputCol("embeddings") + +val ner_tagger = MedicalNerModel.pretrained("ner_ade_clinical", "en", "clinical/models") + .setInputCols(Array("sentences", "tokens", "embeddings")) + .setOutputCol("ner_tags") + +val ner_converter = new NerConverterInternal() + .setInputCols(Array("sentences", "tokens", "ner_tags")) + .setOutputCol("ner_chunks") + +val pos_tagger = PerceptronModel() + .pretrained("pos_clinical", "en", "clinical/models") + .setInputCols(Array("sentences", "tokens")) + .setOutputCol("pos_tags") + +val dependency_parser = DependencyParserModel() + .pretrained("dependency_conllu", "en") + .setInputCols(Array("sentences", "pos_tags", "tokens")) + .setOutputCol("dependencies") + +// Set a filter on pairs of named entities which will be treated as relation candidates +val re_ner_chunk_filter = new 
RENerChunksFilter() + .setInputCols(Array("ner_chunks", "dependencies")) + .setMaxSyntacticDistance(10) + .setOutputCol("re_ner_chunks") + .setRelationPairs(Array("drug-ade", "ade-drug")) + +// The dataset this model is trained to is sentence-wise. +// This model can also be trained on document-level relations - in which case, while predicting, use "document" instead of "sentence" as input. +val re_model = RelationExtractionDLModel() + .pretrained("redl_ade_biobert", "en", "clinical/models") + .setPredictionThreshold(0.5) + .setInputCols(Array("re_ner_chunks", "sentences")) + .setOutputCol("relations") + +val pipeline = new Pipeline().setStages(Array(documenter, + sentencer, + tokenizer, + words_embedder, + ner_tagger, + ner_converter, + pos_tagger, + dependency_parser, + re_ner_chunk_filter, + re_model)) + +val data = Seq("""Been taking Lipitor for 15 years , have experienced severe fatigue a lot. The doctor moved me to voltarene 2 months ago, so far I have only had muscle cramps.""").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash +| relation | entity1 | entity1_begin | entity1_end | chunk1 | entity2 | entity2_begin | entity2_end | chunk2 | confidence | +|---------:|:--------|--------------:|------------:|:----------|:--------|--------------:|------------:|:---------------|-----------:| +| 1 | DRUG | 12 | 18 | Lipitor | ADE | 52 | 65 | severe fatigue | 0.999317 | +| 0 | DRUG | 97 | 105 | voltarene | ADE | 144 | 156 | muscle cramps | 0.774904 | +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|redl_ade_biobert| +|Compatibility:|Healthcare NLP 4.2.4+| +|License:|Licensed| +|Edition:|Official| +|Language:|en| +|Size:|401.7 MB| + +## References + +This model is trained on custom data annotated by JSL. + +## Benchmarking + +```bash +label Recall Precision F1 Support +0 0.829 0.895 0.861 1146 +1 0.955 0.923 0.939 2454 +Avg. 0.892 0.909 0.900 - +Weighted-Avg. 0.915 0.914 0.914 - +``` diff --git a/docs/_posts/Meryem1425/2023-01-14-redl_bodypart_direction_biobert_en.md b/docs/_posts/Meryem1425/2023-01-14-redl_bodypart_direction_biobert_en.md new file mode 100644 index 00000000000000..c38366f71c7c9e --- /dev/null +++ b/docs/_posts/Meryem1425/2023-01-14-redl_bodypart_direction_biobert_en.md @@ -0,0 +1,201 @@ +--- +layout: model +title: Relation Extraction Between Body Parts and Direction Entities (ReDL) +author: John Snow Labs +name: redl_bodypart_direction_biobert +date: 2023-01-14 +tags: [licensed, en, clinical, relation_extraction, tensorflow] +task: Relation Extraction +language: en +edition: Healthcare NLP 4.2.4 +spark_version: 3.0 +supported: true +engine: tensorflow +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Relation extraction between body parts entities like Internal_organ_or_component, External_body_part_or_region etc. and direction entities like upper, lower in clinical texts. 
1 : Shows the body part and direction entity are related, 0 : Shows the body part and direction entity are not related. + +## Predicted Entities + +`1`, `0` + +{:.btn-box} + +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/10.1.Clinical_Relation_Extraction_BodyParts_Models.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/redl_bodypart_direction_biobert_en_4.2.4_3.0_1673710170047.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +documenter = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentencer = SentenceDetector()\ + .setInputCols(["document"])\ + .setOutputCol("sentences") + +tokenizer = Tokenizer()\ + .setInputCols(["sentences"])\ + .setOutputCol("tokens") + +pos_tagger = PerceptronModel()\ + .pretrained("pos_clinical", "en", "clinical/models") \ + .setInputCols(["sentences", "tokens"])\ + .setOutputCol("pos_tags") + +words_embedder = WordEmbeddingsModel() \ + .pretrained("embeddings_clinical", "en", "clinical/models") \ + .setInputCols(["sentences", "tokens"]) \ + .setOutputCol("embeddings") + +ner_tagger = MedicalNerModel.pretrained("ner_jsl_greedy", "en", "clinical/models")\ + .setInputCols("sentences", "tokens", "embeddings")\ + .setOutputCol("ner_tags") + +ner_converter = NerConverterInternal() \ + .setInputCols(["sentences", "tokens", "ner_tags"]) \ + .setOutputCol("ner_chunks") + +dependency_parser = DependencyParserModel() \ + .pretrained("dependency_conllu", "en") \ + .setInputCols(["sentences", "pos_tags", "tokens"]) \ + .setOutputCol("dependencies") + +# Set a filter on pairs of named entities which will be treated as relation candidates +re_ner_chunk_filter = RENerChunksFilter() \ + .setInputCols(["ner_chunks", "dependencies"])\ + .setMaxSyntacticDistance(10)\ + .setOutputCol("re_ner_chunks")\ + .setRelationPairs(['direction-external_body_part_or_region', + 'external_body_part_or_region-direction', + 'direction-internal_organ_or_component', + 'internal_organ_or_component-direction' + ]) + +# The dataset this model is trained to is sentence-wise. +# This model can also be trained on document-level relations - in which case, while predicting, use "document" instead of "sentence" as input. 
+re_model = RelationExtractionDLModel()\ + .pretrained('redl_bodypart_direction_biobert', 'en', "clinical/models") \ + .setPredictionThreshold(0.5)\ + .setInputCols(["re_ner_chunks", "sentences"]) \ + .setOutputCol("relations") + +pipeline = Pipeline(stages=[documenter, sentencer, tokenizer, pos_tagger, words_embedder, ner_tagger, ner_converter, dependency_parser, re_ner_chunk_filter, re_model]) + +data = spark.createDataFrame([[''' MRI demonstrated infarction in the upper brain stem , left cerebellum and right basil ganglia ''']]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documenter = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentencer = new SentenceDetector() + .setInputCols("document") + .setOutputCol("sentences") + +val tokenizer = new Tokenizer() + .setInputCols("sentences") + .setOutputCol("tokens") + +val pos_tagger = PerceptronModel() + .pretrained("pos_clinical", "en", "clinical/models") + .setInputCols(Array("sentences", "tokens")) + .setOutputCol("pos_tags") + +val words_embedder = WordEmbeddingsModel() + .pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentences", "tokens")) + .setOutputCol("embeddings") + +val ner_tagger = MedicalNerModel.pretrained("ner_jsl_greedy", "en", "clinical/models") + .setInputCols(Array("sentences", "tokens", "embeddings")) + .setOutputCol("ner_tags") + +val ner_converter = new NerConverterInternal() + .setInputCols(Array("sentences", "tokens", "ner_tags")) + .setOutputCol("ner_chunks") + +val dependency_parser = DependencyParserModel() + .pretrained("dependency_conllu", "en") + .setInputCols(Array("sentences", "pos_tags", "tokens")) + .setOutputCol("dependencies") + +// Set a filter on pairs of named entities which will be treated as relation candidates +val re_ner_chunk_filter = new RENerChunksFilter() + .setInputCols(Array("ner_chunks", "dependencies")) + .setMaxSyntacticDistance(10) + 
.setOutputCol("re_ner_chunks") + .setRelationPairs(Array("direction-external_body_part_or_region", + "external_body_part_or_region-direction", + "direction-internal_organ_or_component", + "internal_organ_or_component-direction")) + +// The dataset this model is trained to is sentence-wise. +// This model can also be trained on document-level relations - in which case, while predicting, use "document" instead of "sentence" as input. +val re_model = RelationExtractionDLModel() + .pretrained("redl_bodypart_direction_biobert", "en", "clinical/models") + .setPredictionThreshold(0.5) + .setInputCols(Array("re_ner_chunks", "sentences")) + .setOutputCol("relations") + +val pipeline = new Pipeline().setStages(Array(documenter, sentencer, tokenizer, pos_tagger, words_embedder, ner_tagger, ner_converter, dependency_parser, re_ner_chunk_filter, re_model)) + +val data = Seq("MRI demonstrated infarction in the upper brain stem , left cerebellum and right basil ganglia").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash +| index | relations | entity1 | entity1_begin | entity1_end | chunk1 | entity2 | entity2_begin | entity2_end | chunk2 | confidence | +|-------|-----------|-----------------------------|---------------|-------------|------------|-----------------------------|---------------|-------------|---------------|------------| +| 0 | 1 | Direction | 35 | 39 | upper | Internal_organ_or_component | 41 | 50 | brain stem | 0.9999989 | +| 1 | 0 | Direction | 35 | 39 | upper | Internal_organ_or_component | 59 | 68 | cerebellum | 0.99992585 | +| 2 | 0 | Direction | 35 | 39 | upper | Internal_organ_or_component | 81 | 93 | basil ganglia | 0.9999999 | +| 3 | 0 | Internal_organ_or_component | 41 | 50 | brain stem | Direction | 54 | 57 | left | 0.999811 | +| 4 | 0 | Internal_organ_or_component | 41 | 50 | brain stem | Direction | 75 | 79 | right | 0.9998203 | +| 5 | 1 | Direction | 54 | 57 | left | Internal_organ_or_component | 59 | 68 | cerebellum | 1.0 | +| 6 | 0 | Direction | 54 | 57 | left | Internal_organ_or_component | 81 | 93 | basil ganglia | 0.97616416 | +| 7 | 0 | Internal_organ_or_component | 59 | 68 | cerebellum | Direction | 75 | 79 | right | 0.953046 | +| 8 | 1 | Direction | 75 | 79 | right | Internal_organ_or_component | 81 | 93 | basil ganglia | 1.0 | +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|redl_bodypart_direction_biobert| +|Compatibility:|Healthcare NLP 4.2.4+| +|License:|Licensed| +|Edition:|Official| +|Language:|en| +|Size:|401.7 MB| + +## References + +Trained on an internal dataset. + +## Benchmarking + +```bash +label Recall Precision F1 Support +0 0.856 0.873 0.865 153 +1 0.986 0.984 0.985 1347 +Avg. 
0.921 0.929 0.925 - +``` diff --git a/docs/_posts/Meryem1425/2023-01-14-redl_bodypart_problem_biobert_en.md b/docs/_posts/Meryem1425/2023-01-14-redl_bodypart_problem_biobert_en.md new file mode 100644 index 00000000000000..dd8a7c46918f99 --- /dev/null +++ b/docs/_posts/Meryem1425/2023-01-14-redl_bodypart_problem_biobert_en.md @@ -0,0 +1,191 @@ +--- +layout: model +title: Relation Extraction Between Body Parts and Problem Entities (ReDL) +author: John Snow Labs +name: redl_bodypart_problem_biobert +date: 2023-01-14 +tags: [licensed, en, clinical, relation_extraction, tensorflow] +task: Relation Extraction +language: en +edition: Healthcare NLP 4.2.4 +spark_version: 3.0 +supported: true +engine: tensorflow +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Relation extraction between body parts and problem entities in clinical texts. 1 : Shows that there is a relation between the body part entity and entities labeled as problem (diagnosis, symptom, etc.). 0 : Shows that there is no relation between the body part and problem entities. + +## Predicted Entities + +`0`, `1` + +{:.btn-box} + +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/10.1.Clinical_Relation_Extraction_BodyParts_Models.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/redl_bodypart_problem_biobert_en_4.2.4_3.0_1673713187801.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +documenter = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentencer = SentenceDetector()\ + .setInputCols(["document"])\ + .setOutputCol("sentences") + +tokenizer = Tokenizer()\ + .setInputCols(["sentences"])\ + .setOutputCol("tokens") + +pos_tagger = PerceptronModel()\ + .pretrained("pos_clinical", "en", "clinical/models") \ + .setInputCols(["sentences", "tokens"])\ + .setOutputCol("pos_tags") + +words_embedder = WordEmbeddingsModel() \ + .pretrained("embeddings_clinical", "en", "clinical/models") \ + .setInputCols(["sentences", "tokens"]) \ + .setOutputCol("embeddings") + +ner_tagger = MedicalNerModel.pretrained("ner_jsl_greedy", "en", "clinical/models")\ + .setInputCols("sentences", "tokens", "embeddings")\ + .setOutputCol("ner_tags") + +ner_converter = NerConverterInternal() \ + .setInputCols(["sentences", "tokens", "ner_tags"]) \ + .setOutputCol("ner_chunks") + +dependency_parser = DependencyParserModel() \ + .pretrained("dependency_conllu", "en") \ + .setInputCols(["sentences", "pos_tags", "tokens"]) \ + .setOutputCol("dependencies") + +# Set a filter on pairs of named entities which will be treated as relation candidates +re_ner_chunk_filter = RENerChunksFilter() \ + .setInputCols(["ner_chunks", "dependencies"])\ + .setMaxSyntacticDistance(10)\ + .setOutputCol("re_ner_chunks")\ + .setRelationPairs(['SYMPTOM-EXTERNAL_BODY_PART_OR_REGION',"EXTERNAL_BODY_PART_OR_REGION-SYMPTOM"]) + +# The dataset this model is trained to is sentence-wise. +# This model can also be trained on document-level relations - in which case, while predicting, use "document" instead of "sentence" as input. 
+re_model = RelationExtractionDLModel()\ + .pretrained('redl_bodypart_problem_biobert', 'en', "clinical/models") \ + .setPredictionThreshold(0.5)\ + .setInputCols(["re_ner_chunks", "sentences"]) \ + .setOutputCol("relations") + +pipeline = Pipeline(stages=[documenter, sentencer, tokenizer, pos_tagger, words_embedder, ner_tagger, ner_converter, dependency_parser, re_ner_chunk_filter, re_model]) + +text ="No neurologic deficits other than some numbness in his left hand." + +data = spark.createDataFrame([[text]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documenter = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentencer = new SentenceDetector() + .setInputCols("document") + .setOutputCol("sentences") + +val tokenizer = new Tokenizer() + .setInputCols("sentences") + .setOutputCol("tokens") + +val pos_tagger = PerceptronModel() + .pretrained("pos_clinical", "en", "clinical/models") + .setInputCols(Array("sentences", "tokens")) + .setOutputCol("pos_tags") + +val words_embedder = WordEmbeddingsModel() + .pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentences", "tokens")) + .setOutputCol("embeddings") + +val ner_tagger = MedicalNerModel.pretrained("ner_jsl_greedy", "en", "clinical/models") + .setInputCols(Array("sentences", "tokens", "embeddings")) + .setOutputCol("ner_tags") + +val ner_converter = new NerConverterInternal() + .setInputCols(Array("sentences", "tokens", "ner_tags")) + .setOutputCol("ner_chunks") + +val dependency_parser = DependencyParserModel() + .pretrained("dependency_conllu", "en") + .setInputCols(Array("sentences", "pos_tags", "tokens")) + .setOutputCol("dependencies") + +// Set a filter on pairs of named entities which will be treated as relation candidates +val re_ner_chunk_filter = new RENerChunksFilter() + .setInputCols(Array("ner_chunks", "dependencies")) + .setMaxSyntacticDistance(10) + .setOutputCol("re_ner_chunks") + 
.setRelationPairs(Array("SYMPTOM-EXTERNAL_BODY_PART_OR_REGION","EXTERNAL_BODY_PART_OR_REGION-SYMPTOM")) + +// The dataset this model is trained to is sentence-wise. +// This model can also be trained on document-level relations - in which case, while predicting, use "document" instead of "sentence" as input. +val re_model = RelationExtractionDLModel() + .pretrained("redl_bodypart_problem_biobert", "en", "clinical/models") + .setPredictionThreshold(0.5) + .setInputCols(Array("re_ner_chunks", "sentences")) + .setOutputCol("relations") + +val pipeline = new Pipeline().setStages(Array(documenter, sentencer, tokenizer, pos_tagger, words_embedder, ner_tagger, ner_converter, dependency_parser, re_ner_chunk_filter, re_model)) + +val data = Seq("No neurologic deficits other than some numbness in his left hand.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash ++--------+-------+-------------------+----------------------------+------+----------+ +|relation|entity1|chunk1 |entity2 |chunk2|confidence| ++--------+-------+-------------------+----------------------------+------+----------+ +|0 |Symptom|neurologic deficits|External_body_part_or_region|hand |0.8320218 | +|1 |Symptom|numbness |External_body_part_or_region|hand |0.99943227| ++--------+-------+-------------------+----------------------------+------+----------+ +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|redl_bodypart_problem_biobert| +|Compatibility:|Healthcare NLP 4.2.4+| +|License:|Licensed| +|Edition:|Official| +|Language:|en| +|Size:|401.7 MB| + +## References + +Trained on internal dataset. + +## Benchmarking + +```bash +label Recall Precision F1 Support +0 0.762 0.814 0.787 315 +1 0.938 0.917 0.927 885 +Avg. 0.850 0.865 0.857 - +``` diff --git a/docs/_posts/Meryem1425/2023-01-14-redl_bodypart_procedure_test_biobert_en.md b/docs/_posts/Meryem1425/2023-01-14-redl_bodypart_procedure_test_biobert_en.md new file mode 100644 index 00000000000000..8e98c7997ebc46 --- /dev/null +++ b/docs/_posts/Meryem1425/2023-01-14-redl_bodypart_procedure_test_biobert_en.md @@ -0,0 +1,185 @@ +--- +layout: model +title: Relation Extraction Between Body Parts and Procedures +author: John Snow Labs +name: redl_bodypart_procedure_test_biobert +date: 2023-01-14 +tags: [relation_extraction, en, clinical, dl, licensed, tensorflow] +task: Relation Extraction +language: en +edition: Healthcare NLP 4.2.4 +spark_version: 3.0 +supported: true +engine: tensorflow +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Relation extraction between body parts entities like ‘Internal_organ_or_component’, ’External_body_part_or_region’ etc. and procedure and test entities. 1 : body part and test/procedure are related to each other. 
0 : body part and test/procedure are not related to each other. + +## Predicted Entities + +`1`, `0` + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/redl_bodypart_procedure_test_biobert_en_4.2.4_3.0_1673714088228.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documenter = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentencer = SentenceDetector()\ + .setInputCols(["document"])\ + .setOutputCol("sentences") + +tokenizer = Tokenizer()\ + .setInputCols(["sentences"])\ + .setOutputCol("tokens") + +pos_tagger = PerceptronModel()\ + .pretrained("pos_clinical", "en", "clinical/models") \ + .setInputCols(["sentences", "tokens"])\ + .setOutputCol("pos_tags") + +words_embedder = WordEmbeddingsModel() \ + .pretrained("embeddings_clinical", "en", "clinical/models") \ + .setInputCols(["sentences", "tokens"]) \ + .setOutputCol("embeddings") + +ner_tagger = MedicalNerModel.pretrained("ner_jsl_greedy", "en", "clinical/models")\ + .setInputCols("sentences", "tokens", "embeddings")\ + .setOutputCol("ner_tags") + +ner_converter = NerConverterInternal() \ + .setInputCols(["sentences", "tokens", "ner_tags"]) \ + .setOutputCol("ner_chunks") + +dependency_parser = DependencyParserModel() \ + .pretrained("dependency_conllu", "en") \ + .setInputCols(["sentences", "pos_tags", "tokens"]) \ + .setOutputCol("dependencies") + +# Set a filter on pairs of named entities which will be treated as relation candidates +re_ner_chunk_filter = RENerChunksFilter() \ + .setInputCols(["ner_chunks", "dependencies"])\ + .setMaxSyntacticDistance(10)\ + .setOutputCol("re_ner_chunks")\ + .setRelationPairs(["external_body_part_or_region-test"]) + +# The dataset this model is trained to is sentence-wise. +# This model can also be trained on document-level relations - in which case, while predicting, use "document" instead of "sentence" as input. 
+re_model = RelationExtractionDLModel()\ + .pretrained('redl_bodypart_procedure_test_biobert', 'en', "clinical/models") \ + .setPredictionThreshold(0.5)\ + .setInputCols(["re_ner_chunks", "sentences"]) \ + .setOutputCol("relations") + +pipeline = Pipeline(stages=[documenter, sentencer, tokenizer, pos_tagger, words_embedder, ner_tagger, ner_converter, dependency_parser, re_ner_chunk_filter, re_model]) + +data = spark.createDataFrame([['''TECHNIQUE IN DETAIL: After informed consent was obtained from the patient and his mother, the chest was scanned with portable ultrasound.''']]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documenter = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentencer = new SentenceDetector() + .setInputCols(Array("document")) + .setOutputCol("sentences") + +val tokenizer = new Tokenizer() + .setInputCols(Array("sentences")) + .setOutputCol("tokens") + +val pos_tagger = PerceptronModel() + .pretrained("pos_clinical", "en", "clinical/models") + .setInputCols(Array("sentences", "tokens")) + .setOutputCol("pos_tags") + +val words_embedder = WordEmbeddingsModel() + .pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentences", "tokens")) + .setOutputCol("embeddings") + +val ner_tagger = MedicalNerModel.pretrained("ner_jsl_greedy", "en", "clinical/models") + .setInputCols(Array("sentences", "tokens", "embeddings")) + .setOutputCol("ner_tags") + +val ner_converter = new NerConverterInternal() + .setInputCols(Array("sentences", "tokens", "ner_tags")) + .setOutputCol("ner_chunks") + +val dependency_parser = DependencyParserModel() + .pretrained("dependency_conllu", "en") + .setInputCols(Array("sentences", "pos_tags", "tokens")) + .setOutputCol("dependencies") + +// Set a filter on pairs of named entities which will be treated as relation candidates +val re_ner_chunk_filter = new RENerChunksFilter() + .setInputCols(Array("ner_chunks", "dependencies")) 
+ .setMaxSyntacticDistance(10) + .setOutputCol("re_ner_chunks") + .setRelationPairs("external_body_part_or_region-test") + +// The dataset this model is trained to is sentence-wise. +// This model can also be trained on document-level relations - in which case, while predicting, use "document" instead of "sentence" as input. +val re_model = RelationExtractionDLModel() + .pretrained("redl_bodypart_procedure_test_biobert", "en", "clinical/models") + .setPredictionThreshold(0.5) + .setInputCols(Array("re_ner_chunks", "sentences")) + .setOutputCol("relations") + +val pipeline = new Pipeline().setStages(Array(documenter, sentencer, tokenizer, pos_tagger, words_embedder, ner_tagger, ner_converter, dependency_parser, re_ner_chunk_filter, re_model)) + +val data = Seq("""TECHNIQUE IN DETAIL: After informed consent was obtained from the patient and his mother, the chest was scanned with portable ultrasound.""").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash +| | relation | entity1 | chunk1 | entity2 | chunk2 | confidence | +|---:|-----------:|:-----------------------------|:---------|:----------|:--------------------|-------------:| +| 0 | 1 | External_body_part_or_region | chest | Test | portable ultrasound | 0.99953 | +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|redl_bodypart_procedure_test_biobert| +|Compatibility:|Healthcare NLP 4.2.4+| +|License:|Licensed| +|Edition:|Official| +|Language:|en| +|Size:|401.7 MB| + +## References + +Trained on a custom internal dataset. + +## Benchmarking + +```bash +label Recall Precision F1 Support +0 0.338 0.472 0.394 325 +1 0.904 0.843 0.872 1275 +Avg. 0.621 0.657 0.633 - +``` diff --git a/docs/_posts/Meryem1425/2023-01-14-redl_chemprot_biobert_en.md b/docs/_posts/Meryem1425/2023-01-14-redl_chemprot_biobert_en.md new file mode 100644 index 00000000000000..5c2f69dae66cc9 --- /dev/null +++ b/docs/_posts/Meryem1425/2023-01-14-redl_chemprot_biobert_en.md @@ -0,0 +1,211 @@ +--- +layout: model +title: Extract relations between chemicals and proteins (ReDL) +author: John Snow Labs +name: redl_chemprot_biobert +date: 2023-01-14 +tags: [relation_extraction, licensed, en, clinical, tensorflow] +task: Relation Extraction +language: en +edition: Healthcare NLP 4.2.4 +spark_version: 3.0 +supported: true +engine: tensorflow +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Detect interactions between chemicals and proteins using BERT model by classifying whether a specified semantic relation holds between the chemical and protein entities within a sentence or document. 
+ +## Predicted Entities + +`CPR:1`, `CPR:2`, `CPR:3`, `CPR:4`, `CPR:5`, `CPR:6`, `CPR:7`, `CPR:8`, `CPR:9`, `CPR:10` + +{:.btn-box} +[Live Demo](https://demo.johnsnowlabs.com/healthcare/RE_CHEM_PROT/){:.button.button-orange} +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/10.Clinical_Relation_Extraction.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/redl_chemprot_biobert_en_4.2.4_3.0_1673714908415.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + +In the table below, `redl_chemprot_biobert` RE model, its labels, optimal NER model, and meaningful relation pairs are illustrated. + +| RE MODEL | RE MODEL LABELS | NER MODEL | RE PAIRS | +|:---------------------:|:---------------------------------------------------------------------:|:---------------------:|---------------------------| +| redl_chemprot_biobert | CPR:1, CPR:2, CPR:3, CPR:4, CPR:5, CPR:6, CPR:7, CPR:8, CPR:9, CPR:10 | ner_chemprot_clinical | [“No need to set pairs.”] | + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +documenter = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentencer = SentenceDetector()\ + .setInputCols(["document"])\ + .setOutputCol("sentences") + +tokenizer = Tokenizer()\ + .setInputCols(["sentences"])\ + .setOutputCol("tokens") + +pos_tagger = PerceptronModel()\ + .pretrained("pos_clinical", "en", "clinical/models") \ + .setInputCols(["sentences", "tokens"])\ + .setOutputCol("pos_tags") + +words_embedder = WordEmbeddingsModel() \ + .pretrained("embeddings_clinical", "en", "clinical/models") \ + .setInputCols(["sentences", "tokens"]) \ + .setOutputCol("embeddings") + +ner_tagger = MedicalNerModel.pretrained("ner_chemprot_clinical", "en", "clinical/models")\ + .setInputCols("sentences", "tokens", "embeddings")\ + .setOutputCol("ner_tags") + +ner_converter = NerConverterInternal() \ + .setInputCols(["sentences", "tokens", "ner_tags"]) \ + .setOutputCol("ner_chunks") + +dependency_parser = DependencyParserModel() \ + .pretrained("dependency_conllu", "en") \ + .setInputCols(["sentences", "pos_tags", "tokens"]) \ + .setOutputCol("dependencies") + +# Set a filter on pairs of named entities which will be treated as relation candidates +re_ner_chunk_filter = RENerChunksFilter() \ + .setInputCols(["ner_chunks", "dependencies"])\ + .setMaxSyntacticDistance(10)\ + .setOutputCol("re_ner_chunks") + #.setRelationPairs(['SYMPTOM-EXTERNAL_BODY_PART_OR_REGION']) + +# The dataset this model is trained to is sentence-wise. +# This model can also be trained on document-level relations - in which case, while predicting, use "document" instead of "sentence" as input. 
+re_model = RelationExtractionDLModel()\ + .pretrained('redl_chemprot_biobert', 'en', "clinical/models") \ + .setPredictionThreshold(0.5)\ + .setInputCols(["re_ner_chunks", "sentences"]) \ + .setOutputCol("relations") + +pipeline = Pipeline(stages=[documenter, sentencer, tokenizer, pos_tagger, words_embedder, ner_tagger, ner_converter, dependency_parser, re_ner_chunk_filter, re_model]) + +text='''In this study, we examined the effects of mitiglinide on various cloned K(ATP) channels (Kir6.2/SUR1, Kir6.2/SUR2A, and Kir6.2/SUR2B) reconstituted in COS-1 cells, and compared them to another meglitinide-related compound, nateglinide. Patch-clamp analysis using inside-out recording configuration showed that mitiglinide inhibits the Kir6.2/SUR1 channel currents in a dose-dependent manner (IC50 value, 100 nM) but does not significantly inhibit either Kir6.2/SUR2A or Kir6.2/SUR2B channel currents even at high doses (more than 10 microM). Nateglinide inhibits Kir6.2/SUR1 and Kir6.2/SUR2B channels at 100 nM, and inhibits Kir6.2/SUR2A channels at high concentrations (1 microM). Binding experiments on mitiglinide, nateglinide, and repaglinide to SUR1 expressed in COS-1 cells revealed that they inhibit the binding of [3H]glibenclamide to SUR1 (IC50 values: mitiglinide, 280 nM; nateglinide, 8 microM; repaglinide, 1.6 microM), suggesting that they all share a glibenclamide binding site. The insulin responses to glucose, mitiglinide, tolbutamide, and glibenclamide in MIN6 cells after chronic mitiglinide, nateglinide, or repaglinide treatment were comparable to those after chronic tolbutamide and glibenclamide treatment. 
These results indicate that, similar to the sulfonylureas, mitiglinide is highly specific to the Kir6.2/SUR1 complex, i.e., the pancreatic beta-cell K(ATP) channel, and suggest that mitiglinide may be a clinically useful anti-diabetic drug.''' + +data = spark.createDataFrame([[text]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documenter = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentencer = new SentenceDetector() + .setInputCols(Array("document")) + .setOutputCol("sentences") + +val tokenizer = new Tokenizer() + .setInputCols(Array("sentences")) + .setOutputCol("tokens") + +val pos_tagger = PerceptronModel() + .pretrained("pos_clinical", "en", "clinical/models") + .setInputCols(Array("sentences", "tokens")) + .setOutputCol("pos_tags") + +val words_embedder = WordEmbeddingsModel() + .pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentences", "tokens")) + .setOutputCol("embeddings") + +val ner_tagger = MedicalNerModel.pretrained("ner_chemprot_clinical", "en", "clinical/models") + .setInputCols(Array("sentences", "tokens", "embeddings")) + .setOutputCol("ner_tags") + +val ner_converter = new NerConverterInternal() + .setInputCols(Array("sentences", "tokens", "ner_tags")) + .setOutputCol("ner_chunks") + +val dependency_parser = DependencyParserModel() + .pretrained("dependency_conllu", "en") + .setInputCols(Array("sentences", "pos_tags", "tokens")) + .setOutputCol("dependencies") + +// Set a filter on pairs of named entities which will be treated as relation candidates +val re_ner_chunk_filter = new RENerChunksFilter() + .setInputCols(Array("ner_chunks", "dependencies")) + .setMaxSyntacticDistance(10) + .setOutputCol("re_ner_chunks") + // .setRelationPairs(Array("SYMPTOM-EXTERNAL_BODY_PART_OR_REGION")) + +// The dataset this model is trained to is sentence-wise. 
+// This model can also be trained on document-level relations - in which case, while predicting, use "document" instead of "sentence" as input. +val re_model = RelationExtractionDLModel() + .pretrained("redl_chemprot_biobert", "en", "clinical/models") + .setPredictionThreshold(0.5) + .setInputCols(Array("re_ner_chunks", "sentences")) + .setOutputCol("relations") + +val pipeline = new Pipeline().setStages(Array(documenter, sentencer, tokenizer, pos_tagger, words_embedder, ner_tagger, ner_converter, dependency_parser, re_ner_chunk_filter, re_model)) + +val data = Seq("In this study, we examined the effects of mitiglinide on various cloned K(ATP) channels (Kir6.2/SUR1, Kir6.2/SUR2A, and Kir6.2/SUR2B) reconstituted in COS-1 cells, and compared them to another meglitinide-related compound, nateglinide. Patch-clamp analysis using inside-out recording configuration showed that mitiglinide inhibits the Kir6.2/SUR1 channel currents in a dose-dependent manner (IC50 value, 100 nM) but does not significantly inhibit either Kir6.2/SUR2A or Kir6.2/SUR2B channel currents even at high doses (more than 10 microM). Nateglinide inhibits Kir6.2/SUR1 and Kir6.2/SUR2B channels at 100 nM, and inhibits Kir6.2/SUR2A channels at high concentrations (1 microM). Binding experiments on mitiglinide, nateglinide, and repaglinide to SUR1 expressed in COS-1 cells revealed that they inhibit the binding of [3H]glibenclamide to SUR1 (IC50 values: mitiglinide, 280 nM; nateglinide, 8 microM; repaglinide, 1.6 microM), suggesting that they all share a glibenclamide binding site. The insulin responses to glucose, mitiglinide, tolbutamide, and glibenclamide in MIN6 cells after chronic mitiglinide, nateglinide, or repaglinide treatment were comparable to those after chronic tolbutamide and glibenclamide treatment. 
These results indicate that, similar to the sulfonylureas, mitiglinide is highly specific to the Kir6.2/SUR1 complex, i.e., the pancreatic beta-cell K(ATP) channel, and suggest that mitiglinide may be a clinically useful anti-diabetic drug.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash +| | relation | entity1 | entity1_begin | entity1_end | chunk1 | entity2 | entity2_begin | entity2_end | chunk2 | confidence | +|---:|:-----------|:----------|----------------:|--------------:|:------------------|:----------|----------------:|--------------:|:--------------|-------------:| +| 0 | CPR:2 | CHEMICAL | 43 | 53 | mitiglinide | GENE-N | 80 | 87 | channels | 0.998399 | +| 1 | CPR:2 | GENE-N | 80 | 87 | channels | CHEMICAL | 224 | 234 | nateglinide | 0.994489 | +| 2 | CPR:2 | CHEMICAL | 706 | 716 | mitiglinide | GENE-Y | 751 | 754 | SUR1 | 0.999304 | +| 3 | CPR:2 | CHEMICAL | 823 | 839 | [3H]glibenclamide | GENE-Y | 844 | 847 | SUR1 | 0.998923 | +| 4 | CPR:2 | GENE-N | 998 | 1004 | insulin | CHEMICAL | 1019 | 1025 | glucose | 0.979057 | +| 5 | CPR:2 | GENE-N | 998 | 1004 | insulin | CHEMICAL | 1028 | 1038 | mitiglinide | 0.988504 | +| 6 | CPR:2 | GENE-N | 998 | 1004 | insulin | CHEMICAL | 1041 | 1051 | tolbutamide | 0.991856 | +| 7 | CPR:2 | GENE-N | 998 | 1004 | insulin | CHEMICAL | 1058 | 1070 | glibenclamide | 0.994092 | +| 8 | CPR:2 | GENE-N | 998 | 1004 | insulin | CHEMICAL | 1100 | 1110 | mitiglinide | 0.994409 | +| 9 | CPR:2 | CHEMICAL | 1290 | 1300 | mitiglinide | GENE-N | 1387 | 1393 | channel | 0.981534 | +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|redl_chemprot_biobert| +|Compatibility:|Healthcare NLP 4.2.4+| +|License:|Licensed| +|Edition:|Official| +|Language:|en| +|Size:|401.7 MB| + +## References + +Trained on ChemProt benchmark dataset. + +## Benchmarking + +```bash +label Recall Precision F1 Support +CPR:1 0.870 0.908 0.888 215 +CPR:10 0.818 0.762 0.789 258 +CPR:2 0.726 0.806 0.764 1651 +CPR:3 0.788 0.785 0.787 657 +CPR:4 0.901 0.855 0.878 1599 +CPR:5 0.799 0.891 0.842 184 +CPR:6 0.888 0.845 0.866 258 +CPR:7 0.520 0.765 0.619 25 +CPR:8 0.083 0.333 0.133 24 +CPR:9 0.930 0.805 0.863 629 +Avg. 
0.732 0.775 0.743 - +``` diff --git a/docs/_posts/Meryem1425/2023-01-14-redl_clinical_biobert_en.md b/docs/_posts/Meryem1425/2023-01-14-redl_clinical_biobert_en.md new file mode 100644 index 00000000000000..fff66ec7f2b2db --- /dev/null +++ b/docs/_posts/Meryem1425/2023-01-14-redl_clinical_biobert_en.md @@ -0,0 +1,220 @@ +--- +layout: model +title: Extract relations between problem, treatment and test entities (ReDL) +author: John Snow Labs +name: redl_clinical_biobert +date: 2023-01-14 +tags: [en, licensed, relation_extraction, clinical, tensorflow] +task: Relation Extraction +language: en +edition: Healthcare NLP 4.2.4 +spark_version: 3.0 +supported: true +engine: tensorflow +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Extract relations like `TrIP` : a certain treatment has improved a medical problem and 7 other such relations between problem, treatment and test entities. + +## Predicted Entities + +`PIP`, `TeCP`, `TeRP`, `TrAP`, `TrCP`, `TrIP`, `TrNAP`, `TrWP` + +{:.btn-box} +[Live Demo](https://demo.johnsnowlabs.com/healthcare/RE_CLINICAL/){:.button.button-orange} +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/10.Clinical_Relation_Extraction.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/redl_clinical_biobert_en_4.2.4_3.0_1673727174891.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documenter = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentencer = SentenceDetector()\ + .setInputCols(["document"])\ + .setOutputCol("sentences") + +tokenizer = Tokenizer()\ + .setInputCols(["sentences"])\ + .setOutputCol("tokens") + +pos_tagger = PerceptronModel()\ + .pretrained("pos_clinical", "en", "clinical/models") \ + .setInputCols(["sentences", "tokens"])\ + .setOutputCol("pos_tags") + +words_embedder = WordEmbeddingsModel() \ + .pretrained("embeddings_clinical", "en", "clinical/models") \ + .setInputCols(["sentences", "tokens"]) \ + .setOutputCol("embeddings") + +ner_tagger = MedicalNerModel() \ + .pretrained("ner_clinical", "en", "clinical/models") \ + .setInputCols(["sentences", "tokens", "embeddings"]) \ + .setOutputCol("ner_tags") + +ner_converter = NerConverterInternal() \ + .setInputCols(["sentences", "tokens", "ner_tags"]) \ + .setOutputCol("ner_chunks") + +dependency_parser = DependencyParserModel() \ + .pretrained("dependency_conllu", "en") \ + .setInputCols(["sentences", "pos_tags", "tokens"]) \ + .setOutputCol("dependencies") + +# Set a filter on pairs of named entities which will be treated as relation candidates +re_ner_chunk_filter = RENerChunksFilter() \ + .setInputCols(["ner_chunks", "dependencies"])\ + .setMaxSyntacticDistance(10)\ + .setOutputCol("re_ner_chunks")\ + .setRelationPairs(["problem-test", "problem-treatment"]) + +# The dataset this model is trained to is sentence-wise. +# This model can also be trained on document-level relations - in which case, while predicting, use "document" instead of "sentence" as input. 
+re_model = RelationExtractionDLModel()\ + .pretrained('redl_clinical_biobert', 'en', "clinical/models") \ + .setPredictionThreshold(0.5)\ + .setInputCols(["re_ner_chunks", "sentences"]) \ + .setOutputCol("relations") + +pipeline = Pipeline(stages=[documenter, sentencer, tokenizer, pos_tagger, words_embedder, ner_tagger, ner_converter, dependency_parser, re_ner_chunk_filter, re_model]) + +text ="""A 28-year-old female with a history of gestational diabetes mellitus diagnosed eight years prior to presentation and subsequent type two diabetes mellitus ( T2DM ), +one prior episode of HTG-induced pancreatitis three years prior to presentation, associated with an acute hepatitis , and obesity with a body mass index ( BMI ) of 33.5 kg/m2 , presented with a one-week history of polyuria , polydipsia , poor appetite , and vomiting . Two weeks prior to presentation , she was treated with a five-day course of amoxicillin for a respiratory tract infection . She was on metformin , glipizide , and dapagliflozin for T2DM and atorvastatin and gemfibrozil for HTG . She had been on dapagliflozin for six months at the time of presentation. Physical examination on presentation was significant for dry oral mucosa ; significantly , her abdominal examination was benign with no tenderness , guarding , or rigidity . Pertinent laboratory findings on admission were : serum glucose 111 mg/dl , bicarbonate 18 mmol/l , anion gap 20 , creatinine 0.4 mg/dL , triglycerides 508 mg/dL , total cholesterol 122 mg/dL , glycated hemoglobin ( HbA1c ) 10% , and venous pH 7.27 . Serum lipase was normal at 43 U/L . Serum acetone levels could not be assessed as blood samples kept hemolyzing due to significant lipemia . The patient was initially admitted for starvation ketosis , as she reported poor oral intake for three days prior to admission . 
However , serum chemistry obtained six hours after presentation revealed her glucose was 186 mg/dL , the anion gap was still elevated at 21 , serum bicarbonate was 16 mmol/L , triglyceride level peaked at 2050 mg/dL , and lipase was 52 U/L . The β-hydroxybutyrate level was obtained and found to be elevated at 5.29 mmol/L - the original sample was centrifuged and the chylomicron layer removed prior to analysis due to interference from turbidity caused by lipemia again . The patient was treated with an insulin drip for euDKA and HTG with a reduction in the anion gap to 13 and triglycerides to 1400 mg/dL , within 24 hours . Her euDKA was thought to be precipitated by her respiratory tract infection in the setting of SGLT2 inhibitor use . The patient was seen by the endocrinology service and she was discharged on 40 units of insulin glargine at night , 12 units of insulin lispro with meals , and metformin 1000 mg two times a day . It was determined that all SGLT2 inhibitors should be discontinued indefinitely . +She had close follow-up with endocrinology post discharge . 
+""" + +data = spark.createDataFrame([[text]]).toDF("text") + +p_model = pipeline.fit(data) + +result = p_model.transform(data) +``` +```scala +val documenter = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentencer = new SentenceDetector() + .setInputCols(Array("document")) + .setOutputCol("sentences") + +val tokenizer = new Tokenizer() + .setInputCols(Array("sentences")) + .setOutputCol("tokens") + +val pos_tagger = PerceptronModel() + .pretrained("pos_clinical", "en", "clinical/models") + .setInputCols(Array("sentences", "tokens")) + .setOutputCol("pos_tags") + +val words_embedder = WordEmbeddingsModel() + .pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentences", "tokens")) + .setOutputCol("embeddings") + +val ner_tagger = MedicalNerModel.pretrained("ner_clinical", "en", "clinical/models") + .setInputCols(Array("sentences", "tokens", "embeddings")) + .setOutputCol("ner_tags") + +val ner_converter = new NerConverterInternal() + .setInputCols(Array("sentences", "tokens", "ner_tags")) + .setOutputCol("ner_chunks") + +val dependency_parser = DependencyParserModel() + .pretrained("dependency_conllu", "en") + .setInputCols(Array("sentences", "pos_tags", "tokens")) + .setOutputCol("dependencies") + +// Set a filter on pairs of named entities which will be treated as relation candidates +val re_ner_chunk_filter = new RENerChunksFilter() + .setInputCols(Array("ner_chunks", "dependencies")) + .setMaxSyntacticDistance(10) + .setOutputCol("re_ner_chunks") + .setRelationPairs(Array("problem-test", "problem-treatment")) + +// The dataset this model was trained on is sentence-wise. +// This model can also be trained on document-level relations - in which case, while predicting, use "document" instead of "sentence" as input. 
+val re_model = RelationExtractionDLModel() + .pretrained("redl_clinical_biobert", "en", "clinical/models") + .setPredictionThreshold(0.5) + .setInputCols(Array("re_ner_chunks", "sentences")) + .setOutputCol("relations") + +val pipeline = new Pipeline().setStages(Array(documenter, sentencer, tokenizer, pos_tagger, words_embedder, ner_tagger, ner_converter, dependency_parser, re_ner_chunk_filter, re_model)) + +val data = Seq("""A 28-year-old female with a history of gestational diabetes mellitus diagnosed eight years prior to presentation and subsequent type two diabetes mellitus ( T2DM ), one prior episode of HTG-induced pancreatitis three years prior to presentation, associated with an acute hepatitis , and obesity with a body mass index ( BMI ) of 33.5 kg/m2 , presented with a one-week history of polyuria , polydipsia , poor appetite , and vomiting . Two weeks prior to presentation , she was treated with a five-day course of amoxicillin for a respiratory tract infection . She was on metformin , glipizide , and dapagliflozin for T2DM and atorvastatin and gemfibrozil for HTG . She had been on dapagliflozin for six months at the time of presentation. Physical examination on presentation was significant for dry oral mucosa ; significantly , her abdominal examination was benign with no tenderness , guarding , or rigidity . Pertinent laboratory findings on admission were : serum glucose 111 mg/dl , bicarbonate 18 mmol/l , anion gap 20 , creatinine 0.4 mg/dL , triglycerides 508 mg/dL , total cholesterol 122 mg/dL , glycated hemoglobin ( HbA1c ) 10% , and venous pH 7.27 . Serum lipase was normal at 43 U/L . Serum acetone levels could not be assessed as blood samples kept hemolyzing due to significant lipemia . The patient was initially admitted for starvation ketosis , as she reported poor oral intake for three days prior to admission . 
However , serum chemistry obtained six hours after presentation revealed her glucose was 186 mg/dL , the anion gap was still elevated at 21 , serum bicarbonate was 16 mmol/L , triglyceride level peaked at 2050 mg/dL , and lipase was 52 U/L . The β-hydroxybutyrate level was obtained and found to be elevated at 5.29 mmol/L - the original sample was centrifuged and the chylomicron layer removed prior to analysis due to interference from turbidity caused by lipemia again . The patient was treated with an insulin drip for euDKA and HTG with a reduction in the anion gap to 13 and triglycerides to 1400 mg/dL , within 24 hours . Her euDKA was thought to be precipitated by her respiratory tract infection in the setting of SGLT2 inhibitor use . The patient was seen by the endocrinology service and she was discharged on 40 units of insulin glargine at night , 12 units of insulin lispro with meals , and metformin 1000 mg two times a day . It was determined that all SGLT2 inhibitors should be discontinued indefinitely . She had close follow-up with endocrinology post discharge.""").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash ++--------+---------+-------------+-----------+--------------------+---------+-------------+-----------+--------------------+----------+ +|relation| entity1|entity1_begin|entity1_end| chunk1| entity2|entity2_begin|entity2_end| chunk2|confidence| ++--------+---------+-------------+-----------+--------------------+---------+-------------+-----------+--------------------+----------+ +| TrAP|TREATMENT| 512| 522| amoxicillin| PROBLEM| 528| 556|a respiratory tra...|0.99863595| +| TrAP|TREATMENT| 571| 579| metformin| PROBLEM| 617| 620| T2DM|0.99126583| +| TrAP|TREATMENT| 583| 591| glipizide| PROBLEM| 617| 620| T2DM|0.99036837| +| TrAP|TREATMENT| 583| 591| glipizide| PROBLEM| 659| 661| HTG|0.53253245| +| TrAP|TREATMENT| 599| 611| dapagliflozin| PROBLEM| 617| 620| T2DM| 0.9954288| +| TrAP|TREATMENT| 599| 611| dapagliflozin| PROBLEM| 659| 661| HTG|0.95774424| +| TrAP| PROBLEM| 617| 620| T2DM|TREATMENT| 626| 637| atorvastatin| 0.9347153| +| TrAP| PROBLEM| 617| 620| T2DM|TREATMENT| 643| 653| gemfibrozil|0.97919524| +| TrAP|TREATMENT| 626| 637| atorvastatin| PROBLEM| 659| 661| HTG| 0.7040749| +| TrAP|TREATMENT| 643| 653| gemfibrozil| PROBLEM| 659| 661| HTG|0.97676986| +| TeRP| TEST| 739| 758|Physical examination| PROBLEM| 796| 810| dry oral mucosa| 0.9983334| +| TeRP| TEST| 830| 854|her abdominal exa...| PROBLEM| 875| 884| tenderness|0.99468285| +| TeRP| TEST| 830| 854|her abdominal exa...| PROBLEM| 888| 895| guarding| 0.9940719| +| TeRP| TEST| 830| 854|her abdominal exa...| PROBLEM| 902| 909| rigidity|0.99489564| +| TeRP| TEST| 1246| 1258| blood samples| PROBLEM| 1283| 1301| significant lipemia|0.76421493| +| TeRP| TEST| 1444| 1458| serum chemistry| PROBLEM| 1553| 1566| still elevated| 0.9956291| +| TeRP| TEST| 1507| 1517| her glucose| PROBLEM| 1553| 1566| still elevated|0.97471684| +| TeRP| TEST| 1535| 1547| the anion gap| PROBLEM| 1553| 1566| still elevated|0.99222517| +| TeRP| PROBLEM| 1553| 1566| still elevated| TEST| 1576| 1592| serum 
bicarbonate|0.97230035| +| TeRP| PROBLEM| 1553| 1566| still elevated| TEST| 1610| 1627| triglyceride level|0.96121335| ++--------+---------+-------------+-----------+--------------------+---------+-------------+-----------+--------------------+----------+ +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|redl_clinical_biobert| +|Compatibility:|Healthcare NLP 4.2.4+| +|License:|Licensed| +|Edition:|Official| +|Language:|en| +|Size:|401.7 MB| + +## References + +Trained on 2010 i2b2 relation challenge. + +## Benchmarking + +```bash +label Recall Precision F1 Support +PIP 0.859 0.878 0.869 1435 +TeCP 0.629 0.782 0.697 337 +TeRP 0.903 0.929 0.916 2034 +TrAP 0.872 0.866 0.869 1693 +TrCP 0.641 0.677 0.659 340 +TrIP 0.517 0.796 0.627 151 +TrNAP 0.402 0.672 0.503 112 +TrWP 0.257 0.824 0.392 109 +Avg. 0.635 0.803 0.691 - +``` diff --git a/docs/_posts/Meryem1425/2023-01-14-redl_date_clinical_biobert_en.md b/docs/_posts/Meryem1425/2023-01-14-redl_date_clinical_biobert_en.md new file mode 100644 index 00000000000000..8d59fe7397f786 --- /dev/null +++ b/docs/_posts/Meryem1425/2023-01-14-redl_date_clinical_biobert_en.md @@ -0,0 +1,193 @@ +--- +layout: model +title: Relation Extraction Between Dates and Clinical Entities (ReDL) +author: John Snow Labs +name: redl_date_clinical_biobert +date: 2023-01-14 +tags: [licensed, en, clinical, relation_extraction, tensorflow] +task: Relation Extraction +language: en +edition: Healthcare NLP 4.2.4 +spark_version: 3.0 +supported: true +engine: tensorflow +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Identify if tests were conducted on a particular date or any diagnosis was made on a specific date by checking relations between clinical entities and dates. 1 : Shows date and the clinical entity are related, 0 : Shows date and the clinical entity are not related. 
+ +## Predicted Entities + +`1`, `0` + +{:.btn-box} +[Live Demo](https://demo.johnsnowlabs.com/healthcare/RE_CLINICAL_DATE/){:.button.button-orange} +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/10.1.Clinical_Relation_Extraction_BodyParts_Models.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/redl_date_clinical_biobert_en_4.2.4_3.0_1673731277460.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +documenter = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentencer = SentenceDetector()\ + .setInputCols(["document"])\ + .setOutputCol("sentences") + +tokenizer = Tokenizer()\ + .setInputCols(["sentences"])\ + .setOutputCol("tokens") + +words_embedder = WordEmbeddingsModel()\ + .pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentences", "tokens"])\ + .setOutputCol("embeddings") + +pos_tagger = PerceptronModel()\ + .pretrained("pos_clinical", "en", "clinical/models") \ + .setInputCols(["sentences", "tokens"])\ + .setOutputCol("pos_tags") + +events_ner_tagger = MedicalNerModel.pretrained("ner_events_clinical", "en", "clinical/models")\ + .setInputCols("sentences", "tokens", "embeddings")\ + .setOutputCol("ner_tags") + +ner_chunker = NerConverterInternal()\ + .setInputCols(["sentences", "tokens", "ner_tags"])\ + .setOutputCol("ner_chunks") + +dependency_parser = DependencyParserModel() \ + .pretrained("dependency_conllu", "en") \ + .setInputCols(["sentences", "pos_tags", "tokens"]) \ + .setOutputCol("dependencies") + +events_re_ner_chunk_filter = RENerChunksFilter() \ + .setInputCols(["ner_chunks", "dependencies"])\ + .setOutputCol("re_ner_chunks") + +events_re_Model = RelationExtractionDLModel() \ + .pretrained('redl_date_clinical_biobert', "en", "clinical/models")\ + .setPredictionThreshold(0.5)\ + .setInputCols(["re_ner_chunks", "sentences"]) \ + .setOutputCol("relations") + + +pipeline = Pipeline(stages=[ + documenter, + sentencer, + tokenizer, + words_embedder, + pos_tagger, + events_ner_tagger, + ner_chunker, + dependency_parser, + events_re_ner_chunk_filter, + events_re_Model]) + +data = spark.createDataFrame([['''This 73 y/o patient had CT on 1/12/95, with progressive memory and cognitive decline since 8/11/94.''']]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documenter = new 
DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentencer = new SentenceDetector() + .setInputCols("document") + .setOutputCol("sentences") + +val tokenizer = new Tokenizer() + .setInputCols("sentences") + .setOutputCol("tokens") + +val words_embedder = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentences", "tokens")) + .setOutputCol("embeddings") + +val pos_tagger = PerceptronModel() + .pretrained("pos_clinical", "en", "clinical/models") + .setInputCols(Array("sentences", "tokens")) + .setOutputCol("pos_tags") + +val events_ner_tagger = MedicalNerModel.pretrained("ner_events_clinical", "en", "clinical/models") + .setInputCols(Array("sentences", "tokens", "embeddings")) + .setOutputCol("ner_tags") + +val ner_chunker = new NerConverterInternal() + .setInputCols(Array("sentences", "tokens", "ner_tags")) + .setOutputCol("ner_chunks") + +val dependency_parser = DependencyParserModel() + .pretrained("dependency_conllu", "en") + .setInputCols(Array("sentences", "pos_tags", "tokens")) + .setOutputCol("dependencies") + +val events_re_ner_chunk_filter = new RENerChunksFilter() + .setInputCols(Array("ner_chunks", "dependencies")) + .setOutputCol("re_ner_chunks") + +val events_re_Model = RelationExtractionDLModel() + .pretrained("redl_date_clinical_biobert", "en", "clinical/models") + .setPredictionThreshold(0.5) + .setInputCols(Array("re_ner_chunks", "sentences")) + .setOutputCol("relations") + +val pipeline = new Pipeline().setStages(Array(documenter,sentencer,tokenizer,words_embedder,pos_tagger,events_ner_tagger,ner_chunker,dependency_parser,events_re_ner_chunk_filter,events_re_Model)) + +val data = Seq("This 73 y/o patient had CT on 1/12/95, with progressive memory and cognitive decline since 8/11/94.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash ++--------+-------+-------------+-----------+--------------------+-------+-------------+-----------+--------------------+----------+ +|relation|entity1|entity1_begin|entity1_end| chunk1|entity2|entity2_begin|entity2_end| chunk2|confidence| ++--------+-------+-------------+-----------+--------------------+-------+-------------+-----------+--------------------+----------+ +| 1| TEST| 24| 25| CT| DATE| 30| 36| 1/12/95|0.99997973| +| 1| TEST| 24| 25| CT|PROBLEM| 44| 83|progressive memor...| 0.9998983| +| 1| TEST| 24| 25| CT| DATE| 91| 97| 8/11/94| 0.9997316| +| 1| DATE| 30| 36| 1/12/95|PROBLEM| 44| 83|progressive memor...| 0.9998915| +| 1| DATE| 30| 36| 1/12/95| DATE| 91| 97| 8/11/94| 0.9997931| +| 1|PROBLEM| 44| 83|progressive memor...| DATE| 91| 97| 8/11/94| 0.9998667| ++--------+-------+-------------+-----------+--------------------+-------+-------------+-----------+--------------------+----------+ +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|redl_date_clinical_biobert| +|Compatibility:|Healthcare NLP 4.2.4+| +|License:|Licensed| +|Edition:|Official| +|Language:|en| +|Size:|401.7 MB| + +## References + +Trained on an internal dataset. + +## Benchmarking + +```bash +label Recall Precision F1 Support +0 0.738 0.729 0.734 84 +1 0.945 0.947 0.946 416 +Avg. 
0.841 0.838 0.840 - +``` diff --git a/docs/_posts/Meryem1425/2023-01-14-redl_drug_drug_interaction_biobert_en.md b/docs/_posts/Meryem1425/2023-01-14-redl_drug_drug_interaction_biobert_en.md new file mode 100644 index 00000000000000..3d630619b3c22d --- /dev/null +++ b/docs/_posts/Meryem1425/2023-01-14-redl_drug_drug_interaction_biobert_en.md @@ -0,0 +1,196 @@ +--- +layout: model +title: Extract relations between effects of using multiple drugs (ReDL) +author: John Snow Labs +name: redl_drug_drug_interaction_biobert +date: 2023-01-14 +tags: [relation_extraction, en, licensed, clinical, tensorflow] +task: Relation Extraction +language: en +edition: Healthcare NLP 4.2.4 +spark_version: 3.0 +supported: true +engine: tensorflow +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Extract potential improvements or harmful effects of Drug-Drug interactions (DDIs) when two or more drugs are taken at the same time or at a certain interval. + +## Predicted Entities + +`DDI-advise`, `DDI-effect`, `DDI-false`, `DDI-int`, `DDI-mechanism` + +{:.btn-box} +[Live Demo](https://demo.johnsnowlabs.com/healthcare/RE_DRUG_DRUG_INT/){:.button.button-orange} +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/10.Clinical_Relation_Extraction.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/redl_drug_drug_interaction_biobert_en_4.2.4_3.0_1673734887835.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documenter = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentencer = SentenceDetector()\ + .setInputCols(["document"])\ + .setOutputCol("sentences") + +tokenizer = Tokenizer()\ + .setInputCols(["sentences"])\ + .setOutputCol("tokens") + +pos_tagger = PerceptronModel()\ + .pretrained("pos_clinical", "en", "clinical/models") \ + .setInputCols(["sentences", "tokens"])\ + .setOutputCol("pos_tags") + +words_embedder = WordEmbeddingsModel() \ + .pretrained("embeddings_clinical", "en", "clinical/models") \ + .setInputCols(["sentences", "tokens"]) \ + .setOutputCol("embeddings") + +ner_tagger = MedicalNerModel.pretrained("ner_posology", "en", "clinical/models")\ + .setInputCols("sentences", "tokens", "embeddings")\ + .setOutputCol("ner_tags") + +ner_converter = NerConverterInternal() \ + .setInputCols(["sentences", "tokens", "ner_tags"]) \ + .setOutputCol("ner_chunks") + +dependency_parser = DependencyParserModel() \ + .pretrained("dependency_conllu", "en") \ + .setInputCols(["sentences", "pos_tags", "tokens"]) \ + .setOutputCol("dependencies") + +# Set a filter on pairs of named entities which will be treated as relation candidates +re_ner_chunk_filter = RENerChunksFilter() \ + .setInputCols(["ner_chunks", "dependencies"])\ + .setMaxSyntacticDistance(10)\ + .setOutputCol("re_ner_chunks") + #.setRelationPairs(['SYMPTOM-EXTERNAL_BODY_PART_OR_REGION']) + +# The dataset this model was trained on is sentence-wise. +# This model can also be trained on document-level relations - in which case, while predicting, use "document" instead of "sentence" as input. 
+re_model = RelationExtractionDLModel()\ + .pretrained('redl_drug_drug_interaction_biobert', 'en', "clinical/models") \ + .setPredictionThreshold(0.5)\ + .setInputCols(["re_ner_chunks", "sentences"]) \ + .setOutputCol("relations") + +pipeline = Pipeline(stages=[documenter, sentencer, tokenizer, pos_tagger, words_embedder, ner_tagger, ner_converter, dependency_parser, re_ner_chunk_filter, re_model]) + +text="""When carbamazepine is withdrawn from the combination therapy, aripiprazole dose should then be reduced. \ +If additional adrenergic drugs are to be administered by any route, \ +they should be used with caution because the pharmacologically predictable sympathetic effects of Metformin may be potentiated""" + +data = spark.createDataFrame([[text]]).toDF("text") + +p_model = pipeline.fit(data) + +result = p_model.transform(data) +``` +```scala +val documenter = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentencer = new SentenceDetector() + .setInputCols("document") + .setOutputCol("sentences") + +val tokenizer = new Tokenizer() + .setInputCols("sentences") + .setOutputCol("tokens") + +val pos_tagger = PerceptronModel() + .pretrained("pos_clinical", "en", "clinical/models") + .setInputCols(Array("sentences", "tokens")) + .setOutputCol("pos_tags") + +val words_embedder = WordEmbeddingsModel() + .pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentences", "tokens")) + .setOutputCol("embeddings") + +val ner_tagger = MedicalNerModel.pretrained("ner_posology", "en", "clinical/models") + .setInputCols(Array("sentences", "tokens", "embeddings")) + .setOutputCol("ner_tags") + +val ner_converter = new NerConverterInternal() + .setInputCols(Array("sentences", "tokens", "ner_tags")) + .setOutputCol("ner_chunks") + +val dependency_parser = DependencyParserModel() + .pretrained("dependency_conllu", "en") + .setInputCols(Array("sentences", "pos_tags", "tokens")) + .setOutputCol("dependencies") + +// Set a 
filter on pairs of named entities which will be treated as relation candidates +val re_ner_chunk_filter = new RENerChunksFilter() + .setInputCols(Array("ner_chunks", "dependencies")) + .setMaxSyntacticDistance(10) + .setOutputCol("re_ner_chunks") + // .setRelationPairs(Array("SYMPTOM-EXTERNAL_BODY_PART_OR_REGION")) + +// The dataset this model was trained on is sentence-wise. +// This model can also be trained on document-level relations - in which case, while predicting, use "document" instead of "sentence" as input. +val re_model = RelationExtractionDLModel() + .pretrained("redl_drug_drug_interaction_biobert", "en", "clinical/models") + .setPredictionThreshold(0.5) + .setInputCols(Array("re_ner_chunks", "sentences")) + .setOutputCol("relations") + +val pipeline = new Pipeline().setStages(Array(documenter, sentencer, tokenizer, pos_tagger, words_embedder, ner_tagger, ner_converter, dependency_parser, re_ner_chunk_filter, re_model)) + +val data = Seq("""When carbamazepine is withdrawn from the combination therapy, aripiprazole dose should then be reduced. If additional adrenergic drugs are to be administered by any route, they should be used with caution because the pharmacologically predictable sympathetic effects of Metformin may be potentiated""").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash ++---------+-------+-------------+-----------+-------------+-------+-------------+-----------+------------+----------+ +| relation|entity1|entity1_begin|entity1_end| chunk1|entity2|entity2_begin|entity2_end| chunk2|confidence| ++---------+-------+-------------+-----------+-------------+-------+-------------+-----------+------------+----------+ +|DDI-false| DRUG| 5| 17|carbamazepine| DRUG| 62| 73|aripiprazole|0.91685396| ++---------+-------+-------------+-----------+-------------+-------+-------------+-----------+------------+----------+ +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|redl_drug_drug_interaction_biobert| +|Compatibility:|Healthcare NLP 4.2.4+| +|License:|Licensed| +|Edition:|Official| +|Language:|en| +|Size:|401.7 MB| + +## References + +Trained on DDI Extraction corpus. + +## Benchmarking + +```bash +label Recall Precision F1 Support +DDI-advise 0.758 0.874 0.812 211 +DDI-effect 0.759 0.754 0.756 348 +DDI-false 0.977 0.957 0.967 4097 +DDI-int 0.175 0.458 0.253 63 +DDI-mechanism 0.783 0.853 0.816 281 +Avg. 
0.690 0.779 0.721 - +``` diff --git a/docs/_posts/Meryem1425/2023-01-14-redl_drugprot_biobert_en.md b/docs/_posts/Meryem1425/2023-01-14-redl_drugprot_biobert_en.md new file mode 100644 index 00000000000000..0aed2d8f35f773 --- /dev/null +++ b/docs/_posts/Meryem1425/2023-01-14-redl_drugprot_biobert_en.md @@ -0,0 +1,219 @@ +--- +layout: model +title: Extract relations between drugs and proteins (ReDL) +author: John Snow Labs +name: redl_drugprot_biobert +date: 2023-01-14 +tags: [relation_extraction, clinical, en, licensed, tensorflow] +task: Relation Extraction +language: en +edition: Healthcare NLP 4.2.4 +spark_version: 3.0 +supported: true +engine: tensorflow +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Detect interactions between chemical compounds/drugs and genes/proteins using BERT by classifying whether a specified semantic relation holds between a chemical and gene entities within a sentence or document. The entity labels used during training were derived from the custom NER model created by our team for the DrugProt corpus. These include CHEMICAL for chemical compounds/drugs, GENE for genes/proteins and GENE_AND_CHEMICAL for entity mentions of type GENE and of type CHEMICAL that overlap (such as enzymes and small peptides). The relation categories from the DrugProt corpus were condensed from 13 categories to 10 categories due to low numbers of examples for certain categories. This merging process involved grouping the SUBSTRATE_PRODUCT-OF and SUBSTRATE relation categories together and grouping the AGONIST-ACTIVATOR, AGONIST-INHIBITOR and AGONIST relation categories together. 
+ +## Predicted Entities + +`INHIBITOR`, `DIRECT-REGULATOR`, `SUBSTRATE`, `ACTIVATOR`, `INDIRECT-UPREGULATOR`, `INDIRECT-DOWNREGULATOR`, `ANTAGONIST`, `PRODUCT-OF`, `PART-OF`, `AGONIST` + +{:.btn-box} + +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/10.Clinical_Relation_Extraction.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/redl_drugprot_biobert_en_4.2.4_3.0_1673736326031.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + +In the table below, `redl_drugprot_biobert` RE model, its labels, optimal NER model, and meaningful relation pairs are illustrated. + + +| RE MODEL | RE MODEL LABELS | NER MODEL | RE PAIRS | +|:---------------------:|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|:---------------------:|------------------------------------------------------------------------------------| +| redl_drugprot_biobert | INHIBITOR,
DIRECT-REGULATOR,
SUBSTRATE,
ACTIVATOR,
INDIRECT-UPREGULATOR,
INDIRECT-DOWNREGULATOR,
ANTAGONIST,
PRODUCT-OF,
PART-OF,
AGONIST | ner_drugprot_clinical | [“chemical-gene”,
“chemical-gene_and_chemical”,
“gene_and_chemical-gene”] | + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documenter = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentencer = SentenceDetector()\ + .setInputCols(["document"])\ + .setOutputCol("sentences") + +tokenizer = Tokenizer()\ + .setInputCols(["sentences"])\ + .setOutputCol("tokens") + +words_embedder = WordEmbeddingsModel()\ + .pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentences", "tokens"])\ + .setOutputCol("embeddings") + +drugprot_ner_tagger = MedicalNerModel.pretrained("ner_drugprot_clinical", "en", "clinical/models")\ + .setInputCols("sentences", "tokens", "embeddings")\ + .setOutputCol("ner_tags") + +ner_converter = NerConverterInternal()\ + .setInputCols(["sentences", "tokens", "ner_tags"])\ + .setOutputCol("ner_chunks") + +pos_tagger = PerceptronModel()\ + .pretrained("pos_clinical", "en", "clinical/models")\ + .setInputCols(["sentences", "tokens"])\ + .setOutputCol("pos_tags") + +dependency_parser = DependencyParserModel()\ + .pretrained("dependency_conllu", "en")\ + .setInputCols(["sentences", "pos_tags", "tokens"])\ + .setOutputCol("dependencies") + +# Set a filter on pairs of named entities which will be treated as relation candidates +drugprot_re_ner_chunk_filter = RENerChunksFilter()\ + .setInputCols(["ner_chunks", "dependencies"])\ + .setOutputCol("re_ner_chunks")\ + .setMaxSyntacticDistance(4) + # .setRelationPairs(['CHEMICAL-GENE']) + +drugprot_re_Model = RelationExtractionDLModel()\ + .pretrained('redl_drugprot_biobert', "en", "clinical/models")\ + .setPredictionThreshold(0.9)\ + .setInputCols(["re_ner_chunks", "sentences"])\ + .setOutputCol("relations") + +pipeline = Pipeline(stages=[documenter, sentencer, tokenizer, words_embedder, drugprot_ner_tagger, ner_converter, pos_tagger, dependency_parser, drugprot_re_ner_chunk_filter, drugprot_re_Model]) + +text='''Lipid specific activation of the murine P4-ATPase Atp8a1 (ATPase II). 
The asymmetric transbilayer distribution of phosphatidylserine (PS) in the mammalian plasma membrane and secretory vesicles is maintained, in part, by an ATP-dependent transporter. This aminophospholipid "flippase" selectively transports PS to the cytosolic leaflet of the bilayer and is sensitive to vanadate, Ca(2+), and modification by sulfhydryl reagents. Although the flippase has not been positively identified, a subfamily of P-type ATPases has been proposed to function as transporters of amphipaths, including PS and other phospholipids. A candidate PS flippase ATP8A1 (ATPase II), originally isolated from bovine secretory vesicles, is a member of this subfamily based on sequence homology to the founding member of the subfamily, the yeast protein Drs2, which has been linked to ribosomal assembly, the formation of Golgi-coated vesicles, and the maintenance of PS asymmetry. To determine if ATP8A1 has biochemical characteristics consistent with a PS flippase, a murine homologue of this enzyme was expressed in insect cells and purified. The purified Atp8a1 is inactive in detergent micelles or in micelles containing phosphatidylcholine, phosphatidic acid, or phosphatidylinositol, is minimally activated by phosphatidylglycerol or phosphatidylethanolamine (PE), and is maximally activated by PS. The selectivity for PS is dependent upon multiple elements of the lipid structure. Similar to the plasma membrane PS transporter, Atp8a1 is activated only by the naturally occurring sn-1,2-glycerol isomer of PS and not the sn-2,3-glycerol stereoisomer. Both flippase and Atp8a1 activities are insensitive to the stereochemistry of the serine headgroup. Most modifications of the PS headgroup structure decrease recognition by the plasma membrane PS flippase. 
Activation of Atp8a1 is also reduced by these modifications; phosphatidylserine-O-methyl ester, lysophosphatidylserine, glycerophosphoserine, and phosphoserine, which are not transported by the plasma membrane flippase, do not activate Atp8a1. Weakly translocated lipids (PE, phosphatidylhydroxypropionate, and phosphatidylhomoserine) are also weak Atp8a1 activators. However, N-methyl-phosphatidylserine, which is transported by the plasma membrane flippase at a rate equivalent to PS, is incapable of activating Atp8a1 activity. These results indicate that the ATPase activity of the secretory granule Atp8a1 is activated by phospholipids binding to a specific site whose properties (PS selectivity, dependence upon glycerol but not serine, stereochemistry, and vanadate sensitivity) are similar to, but distinct from, the properties of the substrate binding site of the plasma membrane flippase.''' + +data = spark.createDataFrame([[text]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documenter = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentencer = new SentenceDetector() + .setInputCols("document") + .setOutputCol("sentences") + +val tokenizer = new Tokenizer() + .setInputCols("sentences") + .setOutputCol("tokens") + +val words_embedder = WordEmbeddingsModel() + .pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentences", "tokens")) + .setOutputCol("embeddings") + +val drugprot_ner_tagger = MedicalNerModel.pretrained("ner_drugprot_clinical", "en", "clinical/models") + .setInputCols(Array("sentences", "tokens", "embeddings")) + .setOutputCol("ner_tags") + +val ner_converter = new NerConverterInternal() + .setInputCols(Array("sentences", "tokens", "ner_tags")) + .setOutputCol("ner_chunks") + +val pos_tagger = PerceptronModel() + .pretrained("pos_clinical", "en", "clinical/models") + .setInputCols(Array("sentences", "tokens")) + .setOutputCol("pos_tags") + +val 
dependency_parser = DependencyParserModel + .pretrained("dependency_conllu", "en") + .setInputCols(Array("sentences", "pos_tags", "tokens")) + .setOutputCol("dependencies") + +// Set a filter on pairs of named entities which will be treated as relation candidates +val drugprot_re_ner_chunk_filter = new RENerChunksFilter() + .setInputCols(Array("ner_chunks", "dependencies")) + .setMaxSyntacticDistance(10) + .setOutputCol("re_ner_chunks") + // .setRelationPairs(Array("CHEMICAL-GENE")) + +// This model can also be trained on document-level relations - in which case, while predicting, use "document" instead of "sentences" as input. +val drugprot_re_Model = RelationExtractionDLModel + .pretrained("redl_drugprot_biobert", "en", "clinical/models") + .setPredictionThreshold(0.9f) + .setInputCols(Array("re_ner_chunks", "sentences")) + .setOutputCol("relations") + +val pipeline = new Pipeline().setStages(Array(documenter, sentencer, tokenizer, words_embedder, drugprot_ner_tagger, ner_converter, pos_tagger, dependency_parser, drugprot_re_ner_chunk_filter, drugprot_re_Model)) + +val data = Seq("""Lipid specific activation of the murine P4-ATPase Atp8a1 (ATPase II). The asymmetric transbilayer distribution of phosphatidylserine (PS) in the mammalian plasma membrane and secretory vesicles is maintained, in part, by an ATP-dependent transporter. This aminophospholipid "flippase" selectively transports PS to the cytosolic leaflet of the bilayer and is sensitive to vanadate, Ca(2+), and modification by sulfhydryl reagents. Although the flippase has not been positively identified, a subfamily of P-type ATPases has been proposed to function as transporters of amphipaths, including PS and other phospholipids. 
A candidate PS flippase ATP8A1 (ATPase II), originally isolated from bovine secretory vesicles, is a member of this subfamily based on sequence homology to the founding member of the subfamily, the yeast protein Drs2, which has been linked to ribosomal assembly, the formation of Golgi-coated vesicles, and the maintenance of PS asymmetry. To determine if ATP8A1 has biochemical characteristics consistent with a PS flippase, a murine homologue of this enzyme was expressed in insect cells and purified. The purified Atp8a1 is inactive in detergent micelles or in micelles containing phosphatidylcholine, phosphatidic acid, or phosphatidylinositol, is minimally activated by phosphatidylglycerol or phosphatidylethanolamine (PE), and is maximally activated by PS. The selectivity for PS is dependent upon multiple elements of the lipid structure. Similar to the plasma membrane PS transporter, Atp8a1 is activated only by the naturally occurring sn-1,2-glycerol isomer of PS and not the sn-2,3-glycerol stereoisomer. Both flippase and Atp8a1 activities are insensitive to the stereochemistry of the serine headgroup. Most modifications of the PS headgroup structure decrease recognition by the plasma membrane PS flippase. Activation of Atp8a1 is also reduced by these modifications; phosphatidylserine-O-methyl ester, lysophosphatidylserine, glycerophosphoserine, and phosphoserine, which are not transported by the plasma membrane flippase, do not activate Atp8a1. Weakly translocated lipids (PE, phosphatidylhydroxypropionate, and phosphatidylhomoserine) are also weak Atp8a1 activators. However, N-methyl-phosphatidylserine, which is transported by the plasma membrane flippase at a rate equivalent to PS, is incapable of activating Atp8a1 activity. 
These results indicate that the ATPase activity of the secretory granule Atp8a1 is activated by phospholipids binding to a specific site whose properties (PS selectivity, dependence upon glycerol but not serine, stereochemistry, and vanadate sensitivity) are similar to, but distinct from, the properties of the substrate binding site of the plasma membrane flippase.""").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash ++---------+-----------------+-------------+-----------+--------------------+-----------------+-------------+-----------+--------------------+----------+ +| relation| entity1|entity1_begin|entity1_end| chunk1| entity2|entity2_begin|entity2_end| chunk2|confidence| ++---------+-----------------+-------------+-----------+--------------------+-----------------+-------------+-----------+--------------------+----------+ +|ACTIVATOR| GENE| 33| 48| murine P4-ATPase| GENE| 50| 55| Atp8a1|0.95415354| +|ACTIVATOR| GENE| 50| 55| Atp8a1| GENE| 58| 66| ATPase II| 0.9600417| +|SUBSTRATE| CHEMICAL| 114| 131| phosphatidylserine|GENE_AND_CHEMICAL| 224| 248|ATP-dependent tra...| 0.9931178| +|SUBSTRATE| CHEMICAL| 134| 135| PS|GENE_AND_CHEMICAL| 224| 248|ATP-dependent tra...| 0.9978284| +|SUBSTRATE|GENE_AND_CHEMICAL| 256| 282|aminophospholipid...| CHEMICAL| 308| 309| PS| 0.9968598| +|SUBSTRATE| GENE| 443| 450| flippase| CHEMICAL| 589| 590| PS| 0.9991992| +|ACTIVATOR| CHEMICAL| 1201| 1219| phosphatidylcholine| CHEMICAL| 1222| 1238| phosphatidic acid|0.96227807| +|ACTIVATOR| CHEMICAL| 1244| 1263|phosphatidylinositol| CHEMICAL| 1292| 1311|phosphatidylglycerol|0.93301487| +|ACTIVATOR| CHEMICAL| 1244| 1263|phosphatidylinositol| CHEMICAL| 1316| 1339|phosphatidylethan...|0.93579245| +|ACTIVATOR| CHEMICAL| 1292| 1311|phosphatidylglycerol| CHEMICAL| 1316| 1339|phosphatidylethan...| 0.9583067| +|ACTIVATOR| CHEMICAL| 1292| 1311|phosphatidylglycerol| CHEMICAL| 1342| 1343| PE| 0.9603738| +|ACTIVATOR| CHEMICAL| 1316| 1339|phosphatidylethan...| CHEMICAL| 1342| 1343| PE| 0.9596611| +|ACTIVATOR| CHEMICAL| 1316| 1339|phosphatidylethan...| CHEMICAL| 1377| 1378| PS| 0.9832381| +|ACTIVATOR| CHEMICAL| 1342| 1343| PE| CHEMICAL| 1377| 1378| PS| 0.981709| +|ACTIVATOR| GENE| 1511| 1516| Atp8a1| CHEMICAL| 1563| 1577| sn-1,2-glycerol|0.99146277| +|ACTIVATOR| GENE| 1511| 1516| Atp8a1| CHEMICAL| 1589| 1590| PS| 0.9842391| +|ACTIVATOR| GENE| 1511| 1516| Atp8a1| CHEMICAL| 1604| 1618| 
sn-2,3-glycerol|0.98676455| +| PART-OF| GENE| 1639| 1646| flippase| CHEMICAL| 1716| 1721| serine| 0.9470919| +|SUBSTRATE| CHEMICAL| 1936| 1957|lysophosphatidyls...| GENE| 2050| 2057| flippase|0.98919815| +|SUBSTRATE| CHEMICAL| 1960| 1979|glycerophosphoserine| GENE| 2050| 2057| flippase| 0.9857248| ++---------+-----------------+-------------+-----------+--------------------+-----------------+-------------+-----------+--------------------+----------+ +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|redl_drugprot_biobert| +|Compatibility:|Healthcare NLP 4.2.4+| +|License:|Licensed| +|Edition:|Official| +|Language:|en| +|Size:|401.7 MB| + +## References + +This model was trained on the DrugProt corpus. + +## Benchmarking + +```bash +label recall precision f1 support +ACTIVATOR 0.885 0.776 0.827 235 +AGONIST 0.810 0.925 0.864 137 +ANTAGONIST 0.970 0.919 0.944 199 +DIRECT-REGULATOR 0.836 0.901 0.867 403 +INDIRECT-DOWNREGULATOR 0.885 0.850 0.867 313 +INDIRECT-UPREGULATOR 0.844 0.887 0.865 270 +INHIBITOR 0.947 0.937 0.942 1083 +PART-OF 0.939 0.889 0.913 247 +PRODUCT-OF 0.697 0.953 0.805 145 +SUBSTRATE 0.912 0.884 0.898 468 +Avg 0.873 0.892 0.879 - +Weighted-Avg 0.897 0.899 0.897 - +``` \ No newline at end of file diff --git a/docs/_posts/Meryem1425/2023-01-14-redl_human_phenotype_gene_biobert_en.md b/docs/_posts/Meryem1425/2023-01-14-redl_human_phenotype_gene_biobert_en.md new file mode 100644 index 00000000000000..b78bc15646ab79 --- /dev/null +++ b/docs/_posts/Meryem1425/2023-01-14-redl_human_phenotype_gene_biobert_en.md @@ -0,0 +1,203 @@ +--- +layout: model +title: Extract relations between phenotypic abnormalities and diseases (ReDL) +author: John Snow Labs +name: redl_human_phenotype_gene_biobert +date: 2023-01-14 +tags: [relation_extraction, en, licensed, clinical, tensorflow] +task: Relation Extraction +language: en +edition: Healthcare NLP 4.2.4 +spark_version: 3.0 +supported: true +engine: tensorflow +article_header: + type: 
cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Extract relations to fully understand the origin of some phenotypic abnormalities and their associated diseases. 1 : Entities are related, 0 : Entities are not related. + +## Predicted Entities + +`1`, `0` + +{:.btn-box} + +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/10.Clinical_Relation_Extraction.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/redl_human_phenotype_gene_biobert_en_4.2.4_3.0_1673737099610.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documenter = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentencer = SentenceDetector()\ + .setInputCols(["document"])\ + .setOutputCol("sentences") + +tokenizer = Tokenizer()\ + .setInputCols(["sentences"])\ + .setOutputCol("tokens") + +pos_tagger = PerceptronModel()\ + .pretrained("pos_clinical", "en", "clinical/models") \ + .setInputCols(["sentences", "tokens"])\ + .setOutputCol("pos_tags") + +words_embedder = WordEmbeddingsModel() \ + .pretrained("embeddings_clinical", "en", "clinical/models") \ + .setInputCols(["sentences", "tokens"]) \ + .setOutputCol("embeddings") + +ner_tagger = MedicalNerModel.pretrained("ner_human_phenotype_gene_clinical", "en", "clinical/models")\ + .setInputCols("sentences", "tokens", "embeddings")\ + .setOutputCol("ner_tags") + +ner_converter = NerConverterInternal() \ + .setInputCols(["sentences", "tokens", "ner_tags"]) \ + .setOutputCol("ner_chunks") + +dependency_parser = DependencyParserModel() \ + .pretrained("dependency_conllu", "en") \ + .setInputCols(["sentences", "pos_tags", "tokens"]) \ + .setOutputCol("dependencies") + +#Set a filter on pairs of named entities which will be treated as relation candidates +re_ner_chunk_filter = RENerChunksFilter() \ + .setInputCols(["ner_chunks", "dependencies"])\ + .setMaxSyntacticDistance(10)\ + .setOutputCol("re_ner_chunks") + +# The dataset this model is trained to is sentence-wise. +# This model can also be trained on document-level relations - in which case, while predicting, use "document" instead of "sentence" as input. 
+re_model = RelationExtractionDLModel()\ + .pretrained('redl_human_phenotype_gene_biobert', 'en', "clinical/models") \ + .setPredictionThreshold(0.5)\ + .setInputCols(["re_ner_chunks", "sentences"]) \ + .setOutputCol("relations") + +pipeline = Pipeline(stages=[documenter, sentencer, tokenizer, pos_tagger, words_embedder, ner_tagger, ner_converter, dependency_parser, re_ner_chunk_filter, re_model]) + +text = """She has a retinal degeneration, hearing loss and renal failure, short stature, Mutations in the SH3PXD2B gene coding for the Tks4 protein are responsible for the autosomal recessive.""" + +data = spark.createDataFrame([[text]]).toDF("text") + +p_model = pipeline.fit(data) + +result = p_model.transform(data) +``` +```scala +val documenter = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentencer = new SentenceDetector() + .setInputCols("document") + .setOutputCol("sentences") + +val tokenizer = new Tokenizer() + .setInputCols("sentences") + .setOutputCol("tokens") + +val pos_tagger = PerceptronModel + .pretrained("pos_clinical", "en", "clinical/models") + .setInputCols(Array("sentences", "tokens")) + .setOutputCol("pos_tags") + +val words_embedder = WordEmbeddingsModel + .pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentences", "tokens")) + .setOutputCol("embeddings") + +val ner_tagger = MedicalNerModel.pretrained("ner_human_phenotype_gene_clinical", "en", "clinical/models") + .setInputCols(Array("sentences", "tokens", "embeddings")) + .setOutputCol("ner_tags") + +val ner_converter = new NerConverterInternal() + .setInputCols(Array("sentences", "tokens", "ner_tags")) + .setOutputCol("ner_chunks") + +val dependency_parser = DependencyParserModel + .pretrained("dependency_conllu", "en") + .setInputCols(Array("sentences", "pos_tags", "tokens")) + .setOutputCol("dependencies") + +// Set a filter on pairs of named entities which will be treated as relation candidates +val 
re_ner_chunk_filter = new RENerChunksFilter() + .setInputCols(Array("ner_chunks", "dependencies")) + .setMaxSyntacticDistance(10) + .setOutputCol("re_ner_chunks") + +// The dataset this model is trained on is sentence-wise. +// This model can also be trained on document-level relations - in which case, while predicting, use "document" instead of "sentences" as input. +val re_model = RelationExtractionDLModel + .pretrained("redl_human_phenotype_gene_biobert", "en", "clinical/models") + .setPredictionThreshold(0.5f) + .setInputCols(Array("re_ner_chunks", "sentences")) + .setOutputCol("relations") + +val pipeline = new Pipeline().setStages(Array(documenter, sentencer, tokenizer, pos_tagger, words_embedder, ner_tagger, ner_converter, dependency_parser, re_ner_chunk_filter, re_model)) + +val data = Seq("""She has a retinal degeneration, hearing loss and renal failure, short stature, Mutations in the SH3PXD2B gene coding for the Tks4 protein are responsible for the autosomal recessive.""").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash ++--------+-------+-------------+-----------+--------------------+-------+-------------+-----------+-------------------+----------+ +|relation|entity1|entity1_begin|entity1_end| chunk1|entity2|entity2_begin|entity2_end| chunk2|confidence| ++--------+-------+-------------+-----------+--------------------+-------+-------------+-----------+-------------------+----------+ +| 0| HP| 10| 29|retinal degeneration| HP| 32| 43| hearing loss|0.92880034| +| 0| HP| 10| 29|retinal degeneration| HP| 49| 61| renal failure|0.93935645| +| 0| HP| 10| 29|retinal degeneration| HP| 64| 76| short stature|0.92370766| +| 1| HP| 10| 29|retinal degeneration| GENE| 96| 103| SH3PXD2B|0.63739055| +| 1| HP| 10| 29|retinal degeneration| HP| 162| 180|autosomal recessive|0.58393383| +| 0| HP| 32| 43| hearing loss| HP| 49| 61| renal failure| 0.9543991| +| 0| HP| 32| 43| hearing loss| HP| 64| 76| short stature| 0.8060494| +| 1| HP| 32| 43| hearing loss| GENE| 96| 103| SH3PXD2B| 0.8507128| +| 1| HP| 32| 43| hearing loss| HP| 162| 180|autosomal recessive|0.90283227| +| 0| HP| 49| 61| renal failure| HP| 64| 76| short stature|0.85388213| +| 1| HP| 49| 61| renal failure| GENE| 96| 103| SH3PXD2B|0.76057386| +| 1| HP| 49| 61| renal failure| HP| 162| 180|autosomal recessive|0.85482293| +| 1| HP| 64| 76| short stature| GENE| 96| 103| SH3PXD2B| 0.8951201| +| 1| HP| 64| 76| short stature| HP| 162| 180|autosomal recessive| 0.9018232| +| 1| GENE| 96| 103| SH3PXD2B| HP| 162| 180|autosomal recessive|0.97185487| ++--------+-------+-------------+-----------+--------------------+-------+-------------+-----------+-------------------+----------+ +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|redl_human_phenotype_gene_biobert| +|Compatibility:|Healthcare NLP 4.2.4+| +|License:|Licensed| +|Edition:|Official| +|Language:|en| +|Size:|401.7 MB| + +## References + +Trained on a silver standard corpus of human phenotype and gene annotations and their 
relations. + +## Benchmarking + +```bash +label Recall Precision F1 Support +0 0.922 0.908 0.915 129 +1 0.831 0.855 0.843 71 +Avg. 0.877 0.882 0.879 - +``` \ No newline at end of file diff --git a/docs/_posts/Meryem1425/2023-01-15-redl_nihss_biobert_en.md b/docs/_posts/Meryem1425/2023-01-15-redl_nihss_biobert_en.md new file mode 100644 index 00000000000000..f9dbb0aa71f9e3 --- /dev/null +++ b/docs/_posts/Meryem1425/2023-01-15-redl_nihss_biobert_en.md @@ -0,0 +1,182 @@ +--- +layout: model +title: Extract relations between NIHSS entities +author: John Snow Labs +name: redl_nihss_biobert +date: 2023-01-15 +tags: [en, licensed, clinical, relation_extraction, tensorflow] +task: Relation Extraction +language: en +edition: Healthcare NLP 4.2.4 +spark_version: 3.0 +supported: true +engine: tensorflow +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Relate scale items and their measurements according to NIHSS guidelines. + +## Predicted Entities + +`Has_Value`, `0` + +{:.btn-box} +[Live Demo](https://demo.johnsnowlabs.com/healthcare/RE_NIHSS/){:.button.button-orange} +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/10.Clinical_Relation_Extraction.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/redl_nihss_biobert_en_4.2.4_3.0_1673762755276.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documenter = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentencer = SentenceDetector()\ + .setInputCols(["document"])\ + .setOutputCol("sentences") + +tokenizer = sparknlp.annotators.Tokenizer()\ + .setInputCols(["sentences"])\ + .setOutputCol("tokens") + +pos_tagger = PerceptronModel().pretrained("pos_clinical", "en", "clinical/models") \ + .setInputCols(["sentences", "tokens"])\ + .setOutputCol("pos_tags") + +words_embedder = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models") \ + .setInputCols(["sentences", "tokens"]) \ + .setOutputCol("embeddings") + +ner_tagger = MedicalNerModel.pretrained("ner_nihss", "en", "clinical/models")\ + .setInputCols("sentences", "tokens", "embeddings")\ + .setOutputCol("ner_tags") + +ner_converter = NerConverterInternal() \ + .setInputCols(["sentences", "tokens", "ner_tags"]) \ + .setOutputCol("ner_chunks") + +dependency_parser = DependencyParserModel().pretrained("dependency_conllu", "en") \ + .setInputCols(["sentences", "pos_tags", "tokens"]) \ + .setOutputCol("dependencies") + +# Set a filter on pairs of named entities which will be treated as relation candidates +re_ner_chunk_filter = RENerChunksFilter() \ + .setInputCols(["ner_chunks", "dependencies"])\ + .setMaxSyntacticDistance(10)\ + .setOutputCol("re_ner_chunks") + +re_model = RelationExtractionDLModel().pretrained('redl_nihss_biobert', 'en', "clinical/models") \ + .setPredictionThreshold(0.5)\ + .setInputCols(["re_ner_chunks", "sentences"]) \ + .setOutputCol("relations") + +pipeline = Pipeline(stages=[documenter, sentencer, tokenizer, pos_tagger, words_embedder, ner_tagger, ner_converter, dependency_parser, re_ner_chunk_filter, re_model]) + +text= "There , her initial NIHSS score was 4 , as recorded by the ED physicians . This included 2 for weakness in her left leg and 2 for what they felt was subtle ataxia in her left arm and leg ." 
+ +p_model = pipeline.fit(spark.createDataFrame([[text]]).toDF("text")) + +result = p_model.transform(spark.createDataFrame([[text]]).toDF("text")) +``` +```scala +val documenter = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentencer = new SentenceDetector() + .setInputCols("document") + .setOutputCol("sentences") + +val tokenizer = new Tokenizer() + .setInputCols("sentences") + .setOutputCol("tokens") + +val pos_tagger = PerceptronModel.pretrained("pos_clinical", "en", "clinical/models") + .setInputCols(Array("sentences", "tokens")) + .setOutputCol("pos_tags") + +val words_embedder = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentences", "tokens")) + .setOutputCol("embeddings") + +val ner_tagger = MedicalNerModel.pretrained("ner_nihss", "en", "clinical/models") + .setInputCols(Array("sentences", "tokens", "embeddings")) + .setOutputCol("ner_tags") + +val ner_converter = new NerConverterInternal() + .setInputCols(Array("sentences", "tokens", "ner_tags")) + .setOutputCol("ner_chunks") + +val dependency_parser = DependencyParserModel.pretrained("dependency_conllu", "en") + .setInputCols(Array("sentences", "pos_tags", "tokens")) + .setOutputCol("dependencies") + +// Set a filter on pairs of named entities which will be treated as relation candidates +val re_ner_chunk_filter = new RENerChunksFilter() + .setInputCols(Array("ner_chunks", "dependencies")) + .setMaxSyntacticDistance(10) + .setOutputCol("re_ner_chunks") + +val re_model = RelationExtractionDLModel.pretrained("redl_nihss_biobert", "en", "clinical/models") + .setPredictionThreshold(0.5f) + .setInputCols(Array("re_ner_chunks", "sentences")) + .setOutputCol("relations") + +val pipeline = new Pipeline().setStages(Array(documenter, sentencer, tokenizer, pos_tagger, words_embedder, ner_tagger, ner_converter, dependency_parser, re_ner_chunk_filter, re_model)) + +val data = Seq("""There , her initial 
NIHSS score was 4 , as recorded by the ED physicians . This included 2 for weakness in her left leg and 2 for what they felt was subtle ataxia in her left arm and leg .""").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash ++---------+-----------+-------------+-----------+-----------+------------+-------------+-----------+--------------------+----------+ +| relation| entity1|entity1_begin|entity1_end| chunk1| entity2|entity2_begin|entity2_end| chunk2|confidence| ++---------+-----------+-------------+-----------+-----------+------------+-------------+-----------+--------------------+----------+ +|Has_Value| NIHSS| 20| 30|NIHSS score| Measurement| 36| 36| 4| 0.9998851| +|Has_Value|Measurement| 89| 89| 2| 6a_LeftLeg| 111| 118| left leg| 0.9987311| +| 0|Measurement| 89| 89| 2| Measurement| 124| 124| 2|0.97510725| +| 0|Measurement| 89| 89| 2|7_LimbAtaxia| 156| 185|ataxia in her lef...| 0.999889| +| 0| 6a_LeftLeg| 111| 118| left leg| Measurement| 124| 124| 2|0.99989617| +| 0| 6a_LeftLeg| 111| 118| left leg|7_LimbAtaxia| 156| 185|ataxia in her lef...| 0.9999521| +|Has_Value|Measurement| 124| 124| 2|7_LimbAtaxia| 156| 185|ataxia in her lef...| 0.9896319| ++---------+-----------+-------------+-----------+-----------+------------+-------------+-----------+--------------------+----------+ +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|redl_nihss_biobert| +|Compatibility:|Healthcare NLP 4.2.4+| +|License:|Licensed| +|Edition:|Official| +|Language:|en| +|Size:|401.7 MB| + +## References + +@article{wangnational, title={National Institutes of Health Stroke Scale (NIHSS) Annotations for the MIMIC-III Database}, author={Wang, Jiayang and Huang, Xiaoshuo and Yang, Lin and Li, Jiao} } + +## Benchmarking + +```bash +label Recall Precision F1 Support +0 0.989 0.976 0.982 611 +Has_Value 0.983 0.992 0.988 889 +Avg. 0.986 0.984 0.985 - +Weighted-Avg. 
0.985 0.985 0.985 - +``` \ No newline at end of file diff --git a/docs/_posts/Meryem1425/2023-01-15-redl_oncology_biobert_wip_en.md b/docs/_posts/Meryem1425/2023-01-15-redl_oncology_biobert_wip_en.md new file mode 100644 index 00000000000000..c20725c0917da3 --- /dev/null +++ b/docs/_posts/Meryem1425/2023-01-15-redl_oncology_biobert_wip_en.md @@ -0,0 +1,191 @@ +--- +layout: model +title: Relation Extraction between different oncological entity types (ReDL) +author: John Snow Labs +name: redl_oncology_biobert_wip +date: 2023-01-15 +tags: [licensed, clinical, oncology, en, relation_extraction, temporal, test, biomarker, anatomy, tensorflow] +task: Relation Extraction +language: en +edition: Healthcare NLP 4.2.4 +spark_version: 3.0 +supported: true +engine: tensorflow +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This relation extraction model identifies relations between dates and other clinical entities, between tumor mentions and their size, between anatomical entities and other clinical entities, and between tests and their results. In contrast to re_oncology_granular, all these relation types are labeled as is_related_to. The different types of relations can be identified considering the pairs of entities that are linked. + +## Predicted Entities + +`is_related_to` + +{:.btn-box} +[Live Demo](https://demo.johnsnowlabs.com/healthcare/RE_ONCOLOGY/){:.button.button-orange} +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/27.Oncology_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/redl_oncology_biobert_wip_en_4.2.4_3.0_1673763869198.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer() \ + .setInputCols(["sentence"]) \ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token"]) \ + .setOutputCol("embeddings") + +ner = MedicalNerModel.pretrained("ner_oncology_wip", "en", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +ner_converter = NerConverterInternal() \ + .setInputCols(["sentence", "token", "ner"]) \ + .setOutputCol("ner_chunk") + +pos_tagger = PerceptronModel.pretrained("pos_clinical", "en", "clinical/models") \ + .setInputCols(["sentence", "token"]) \ + .setOutputCol("pos_tags") + +dependency_parser = DependencyParserModel.pretrained("dependency_conllu", "en") \ + .setInputCols(["sentence", "pos_tags", "token"]) \ + .setOutputCol("dependencies") + +re_ner_chunk_filter = RENerChunksFilter()\ + .setInputCols(["ner_chunk", "dependencies"])\ + .setOutputCol("re_ner_chunk")\ + .setMaxSyntacticDistance(10)\ + .setRelationPairs(["Tumor_Finding-Tumor_Size", "Tumor_Size-Tumor_Finding", "Cancer_Surgery-Relative_Date", "Relative_Date-Cancer_Surgery"]) + +re_model = RelationExtractionDLModel.pretrained("redl_oncology_biobert_wip", "en", "clinical/models")\ + .setInputCols(["re_ner_chunk", "sentence"])\ + .setOutputCol("relation_extraction") + +pipeline = Pipeline(stages=[document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter, + pos_tagger, + dependency_parser, + re_ner_chunk_filter, + re_model]) + +data = spark.createDataFrame([["A mastectomy was performed two months ago, and a 3 cm mass was 
extracted."]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") + .setInputCols(Array("document")) + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols(Array("sentence")) + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val ner = MedicalNerModel.pretrained("ner_oncology_wip", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverterInternal() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + +val pos_tagger = PerceptronModel.pretrained("pos_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("pos_tags") + +val dependency_parser = DependencyParserModel.pretrained("dependency_conllu", "en") + .setInputCols(Array("sentence", "pos_tags", "token")) + .setOutputCol("dependencies") + +val re_ner_chunk_filter = new RENerChunksFilter() + .setInputCols(Array("ner_chunk", "dependencies")) + .setOutputCol("re_ner_chunk") + .setMaxSyntacticDistance(10) + .setRelationPairs(Array("Tumor_Finding-Tumor_Size", "Tumor_Size-Tumor_Finding", "Cancer_Surgery-Relative_Date", "Relative_Date-Cancer_Surgery")) + +val re_model = RelationExtractionDLModel.pretrained("redl_oncology_biobert_wip", "en", "clinical/models") + .setPredictionThreshold(0.5f) + .setInputCols(Array("re_ner_chunk", "sentence")) + .setOutputCol("relation_extraction") + +val pipeline = new Pipeline().setStages(Array(document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter, + pos_tagger, + 
dependency_parser, + re_ner_chunk_filter, + re_model)) + +val data = Seq("A mastectomy was performed two months ago, and a 3 cm mass was extracted.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash ++-------------+--------------+-------------+-----------+----------+-------------+-------------+-----------+--------------+----------+ +| relation| entity1|entity1_begin|entity1_end| chunk1| entity2|entity2_begin|entity2_end| chunk2|confidence| ++-------------+--------------+-------------+-----------+----------+-------------+-------------+-----------+--------------+----------+ +|is_related_to|Cancer_Surgery| 2| 11|mastectomy|Relative_Date| 27| 40|two months ago|0.91422147| +|is_related_to| Tumor_Size| 49| 52| 3 cm|Tumor_Finding| 54| 57| mass|0.90398973| ++-------------+--------------+-------------+-----------+----------+-------------+-------------+-----------+--------------+----------+ +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|redl_oncology_biobert_wip| +|Compatibility:|Healthcare NLP 4.2.4+| +|License:|Licensed| +|Edition:|Official| +|Language:|en| +|Size:|401.7 MB| + +## References + +In-house annotated oncology case reports. 
+ +## Benchmarking + +```bash + label recall precision f1 + O 0.82 0.89 0.86 +is_related_to 0.90 0.84 0.87 + macro-avg 0.86 0.87 0.86 +``` diff --git a/docs/_posts/Meryem1425/2023-01-15-redl_oncology_biomarker_result_biobert_wip_en.md b/docs/_posts/Meryem1425/2023-01-15-redl_oncology_biomarker_result_biobert_wip_en.md new file mode 100644 index 00000000000000..d3e561613e01f6 --- /dev/null +++ b/docs/_posts/Meryem1425/2023-01-15-redl_oncology_biomarker_result_biobert_wip_en.md @@ -0,0 +1,196 @@ +--- +layout: model +title: Relation Extraction between Biomarkers and Results (ReDL) +author: John Snow Labs +name: redl_oncology_biomarker_result_biobert_wip +date: 2023-01-15 +tags: [licensed, clinical, oncology, en, relation_extraction, test, biomarker, tensorflow] +task: Relation Extraction +language: en +edition: Healthcare NLP 4.2.4 +spark_version: 3.0 +supported: true +engine: tensorflow +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This relation extraction model links Biomarker and Oncogene extractions to their corresponding Biomarker_Result extractions. + +## Predicted Entities + +`is_finding_of`, `O` + +{:.btn-box} +[Live Demo](https://demo.johnsnowlabs.com/healthcare/RE_ONCOLOGY/){:.button.button-orange} +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/27.Oncology_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/redl_oncology_biomarker_result_biobert_wip_en_4.2.4_3.0_1673766618517.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + +Use relation pairs to include only the combinations of entities that are relevant in your case. + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer() \ + .setInputCols(["sentence"]) \ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token"]) \ + .setOutputCol("embeddings") + +ner = MedicalNerModel.pretrained("ner_oncology_wip", "en", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +ner_converter = NerConverterInternal() \ + .setInputCols(["sentence", "token", "ner"]) \ + .setOutputCol("ner_chunk") + +pos_tagger = PerceptronModel.pretrained("pos_clinical", "en", "clinical/models") \ + .setInputCols(["sentence", "token"]) \ + .setOutputCol("pos_tags") + +dependency_parser = DependencyParserModel.pretrained("dependency_conllu", "en") \ + .setInputCols(["sentence", "pos_tags", "token"]) \ + .setOutputCol("dependencies") + +re_ner_chunk_filter = RENerChunksFilter()\ + .setInputCols(["ner_chunk", "dependencies"])\ + .setOutputCol("re_ner_chunk")\ + .setMaxSyntacticDistance(10)\ + .setRelationPairs(["Biomarker-Biomarker_Result", "Biomarker_Result-Biomarker", "Oncogene-Biomarker_Result", "Biomarker_Result-Oncogene"]) + +re_model = RelationExtractionDLModel.pretrained("redl_oncology_biomarker_result_biobert_wip", "en", "clinical/models")\ + .setInputCols(["re_ner_chunk", "sentence"])\ + .setOutputCol("relation_extraction") + +pipeline = Pipeline(stages=[document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter, + pos_tagger, + dependency_parser, + re_ner_chunk_filter, + re_model]) + +data = spark.createDataFrame([["Immunohistochemistry was negative for thyroid 
transcription factor-1 and napsin A. The test was positive for ER and PR, and negative for HER2."]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") + .setInputCols(Array("document")) + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols(Array("sentence")) + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val ner = MedicalNerModel.pretrained("ner_oncology_wip", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverterInternal() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + +val pos_tagger = PerceptronModel.pretrained("pos_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("pos_tags") + +val dependency_parser = DependencyParserModel.pretrained("dependency_conllu", "en") + .setInputCols(Array("sentence", "pos_tags", "token")) + .setOutputCol("dependencies") + +val re_ner_chunk_filter = new RENerChunksFilter() + .setInputCols(Array("ner_chunk", "dependencies")) + .setOutputCol("re_ner_chunk") + .setMaxSyntacticDistance(10) + .setRelationPairs(Array("Biomarker-Biomarker_Result", "Biomarker_Result-Biomarker", "Oncogene-Biomarker_Result", "Biomarker_Result-Oncogene")) + +val re_model = RelationExtractionDLModel.pretrained("redl_oncology_biomarker_result_biobert_wip", "en", "clinical/models") + .setInputCols(Array("re_ner_chunk", "sentence")) + .setOutputCol("relation_extraction") + +val pipeline = new Pipeline().setStages(Array(document_assembler, + sentence_detector, + tokenizer, + 
word_embeddings, + ner, + ner_converter, + pos_tagger, + dependency_parser, + re_ner_chunk_filter, + re_model)) + +val data = Seq("Immunohistochemistry was negative for thyroid transcription factor-1 and napsin A. The test was positive for ER and PR, and negative for HER2.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash ++-------------+----------------+-------------+-----------+--------+----------------+-------------+-----------+--------------------+----------+ +| relation| entity1|entity1_begin|entity1_end| chunk1| entity2|entity2_begin|entity2_end| chunk2|confidence| ++-------------+----------------+-------------+-----------+--------+----------------+-------------+-----------+--------------------+----------+ +|is_finding_of|Biomarker_Result| 25| 32|negative| Biomarker| 38| 67|thyroid transcrip...|0.99808085| +|is_finding_of|Biomarker_Result| 25| 32|negative| Biomarker| 73| 78| napsin|0.99637383| +|is_finding_of|Biomarker_Result| 96| 103|positive| Biomarker| 109| 110| ER|0.99221414| +|is_finding_of|Biomarker_Result| 96| 103|positive| Biomarker| 116| 117| PR| 0.9893672| +| O|Biomarker_Result| 96| 103|positive| Oncogene| 137| 140| HER2| 0.9986272| +| O| Biomarker| 109| 110| ER|Biomarker_Result| 124| 131| negative| 0.9999089| +| O| Biomarker| 116| 117| PR|Biomarker_Result| 124| 131| negative| 0.9998932| +|is_finding_of|Biomarker_Result| 124| 131|negative| Oncogene| 137| 140| HER2|0.98810333| ++-------------+----------------+-------------+-----------+--------+----------------+-------------+-----------+--------------------+----------+ +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|redl_oncology_biomarker_result_biobert_wip| +|Compatibility:|Healthcare NLP 4.2.4+| +|License:|Licensed| +|Edition:|Official| +|Language:|en| +|Size:|401.7 MB| + +## References + +In-house annotated oncology case reports. 
+ +## Benchmarking + +```bash + label recall precision f1 + O 0.93 0.97 0.95 +is_finding_of 0.97 0.93 0.95 + macro-avg 0.95 0.95 0.95 +``` diff --git a/docs/_posts/Meryem1425/2023-01-15-redl_oncology_granular_biobert_wip_en.md b/docs/_posts/Meryem1425/2023-01-15-redl_oncology_granular_biobert_wip_en.md new file mode 100644 index 00000000000000..0666cbf42a2829 --- /dev/null +++ b/docs/_posts/Meryem1425/2023-01-15-redl_oncology_granular_biobert_wip_en.md @@ -0,0 +1,194 @@ +--- +layout: model +title: Relation Extraction between different oncological entity types using granular classes (ReDL) +author: John Snow Labs +name: redl_oncology_granular_biobert_wip +date: 2023-01-15 +tags: [licensed, clinical, oncology, en, relation_extraction, temporal, test, biomarker, anatomy, tensorflow] +task: Relation Extraction +language: en +edition: Healthcare NLP 4.2.4 +spark_version: 3.0 +supported: true +engine: tensorflow +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Using this relation extraction model, four relation types can be identified: is_date_of (between date entities and other clinical entities), is_size_of (between Tumor_Finding and Tumor_Size), is_location_of (between anatomical entities and other entities) and is_finding_of (between test entities and their results). 
+ +## Predicted Entities + +`is_date_of`, `is_finding_of`, `is_location_of`, `is_size_of`, `O` + +{:.btn-box} +[Live Demo](https://demo.johnsnowlabs.com/healthcare/RE_ONCOLOGY/){:.button.button-orange} +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/27.Oncology_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/redl_oncology_granular_biobert_wip_en_4.2.4_3.0_1673768709402.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + +Use relation pairs to include only the combinations of entities that are relevant in your case. + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer() \ + .setInputCols(["sentence"]) \ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token"]) \ + .setOutputCol("embeddings") + +ner = MedicalNerModel.pretrained("ner_oncology_wip", "en", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +ner_converter = NerConverterInternal() \ + .setInputCols(["sentence", "token", "ner"]) \ + .setOutputCol("ner_chunk") + +pos_tagger = PerceptronModel.pretrained("pos_clinical", "en", "clinical/models") \ + .setInputCols(["sentence", "token"]) \ + .setOutputCol("pos_tags") + +dependency_parser = DependencyParserModel.pretrained("dependency_conllu", "en") \ + .setInputCols(["sentence", "pos_tags", "token"]) \ + .setOutputCol("dependencies") + +re_ner_chunk_filter = RENerChunksFilter()\ + .setInputCols(["ner_chunk", "dependencies"])\ + .setOutputCol("re_ner_chunk")\ + .setMaxSyntacticDistance(10)\ + .setRelationPairs(["Tumor_Finding-Tumor_Size", "Tumor_Size-Tumor_Finding", "Cancer_Surgery-Relative_Date", "Relative_Date-Cancer_Surgery"]) + +re_model = RelationExtractionDLModel.pretrained("redl_oncology_granular_biobert_wip", "en", "clinical/models")\ + .setInputCols(["re_ner_chunk", "sentence"])\ + .setOutputCol("relation_extraction") + +pipeline = Pipeline(stages=[document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter, + pos_tagger, + dependency_parser, + re_ner_chunk_filter, + re_model]) + +data = spark.createDataFrame([["A mastectomy was performed two months ago, and a 3 cm 
mass was extracted."]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") + .setInputCols(Array("document")) + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols(Array("sentence")) + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val ner = MedicalNerModel.pretrained("ner_oncology_wip", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverterInternal() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + +val pos_tagger = PerceptronModel.pretrained("pos_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("pos_tags") + +val dependency_parser = DependencyParserModel.pretrained("dependency_conllu", "en") + .setInputCols(Array("sentence", "pos_tags", "token")) + .setOutputCol("dependencies") + +val re_ner_chunk_filter = new RENerChunksFilter() + .setInputCols(Array("ner_chunk", "dependencies")) + .setOutputCol("re_ner_chunk") + .setMaxSyntacticDistance(10) + .setRelationPairs(Array("Tumor_Finding-Tumor_Size", "Tumor_Size-Tumor_Finding", "Cancer_Surgery-Relative_Date", "Relative_Date-Cancer_Surgery")) + +val re_model = RelationExtractionDLModel.pretrained("redl_oncology_granular_biobert_wip", "en", "clinical/models") + .setPredictionThreshold(0.5f) + .setInputCols(Array("re_ner_chunk", "sentence")) + .setOutputCol("relation_extraction") + +val pipeline = new Pipeline().setStages(Array(document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter, + 
pos_tagger, + dependency_parser, + re_ner_chunk_filter, + re_model)) + +val data = Seq("A mastectomy was performed two months ago, and a 3 cm mass was extracted.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash ++----------+--------------+-------------+-----------+----------+-------------+-------------+-----------+--------------+----------+ +| relation| entity1|entity1_begin|entity1_end| chunk1| entity2|entity2_begin|entity2_end| chunk2|confidence| ++----------+--------------+-------------+-----------+----------+-------------+-------------+-----------+--------------+----------+ +|is_date_of|Cancer_Surgery| 2| 11|mastectomy|Relative_Date| 27| 40|two months ago| 0.9652523| +|is_size_of| Tumor_Size| 49| 52| 3 cm|Tumor_Finding| 54| 57| mass|0.81723577| ++----------+--------------+-------------+-----------+----------+-------------+-------------+-----------+--------------+----------+ +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|redl_oncology_granular_biobert_wip| +|Compatibility:|Healthcare NLP 4.2.4+| +|License:|Licensed| +|Edition:|Official| +|Language:|en| +|Size:|401.7 MB| + +## References + +In-house annotated oncology case reports. 
+ +## Benchmarking + +```bash + label recall precision f1 + O 0.83 0.91 0.87 + is_date_of 0.82 0.80 0.81 + is_finding_of 0.92 0.85 0.88 +is_location_of 0.95 0.85 0.90 + is_size_of 0.91 0.80 0.85 + macro-avg 0.89 0.84 0.86 +``` diff --git a/docs/_posts/Meryem1425/2023-01-15-redl_oncology_location_biobert_wip_en.md b/docs/_posts/Meryem1425/2023-01-15-redl_oncology_location_biobert_wip_en.md new file mode 100644 index 00000000000000..b86c8f226540a9 --- /dev/null +++ b/docs/_posts/Meryem1425/2023-01-15-redl_oncology_location_biobert_wip_en.md @@ -0,0 +1,190 @@ +--- +layout: model +title: Relation Extraction between anatomical entities and other clinical entities (ReDL) +author: John Snow Labs +name: redl_oncology_location_biobert_wip +date: 2023-01-15 +tags: [licensed, clinical, oncology, en, relation_extraction, anatomy, tensorflow] +task: Relation Extraction +language: en +edition: Healthcare NLP 4.2.4 +spark_version: 3.0 +supported: true +engine: tensorflow +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This relation extraction model links extractions from anatomical entities (such as Site_Breast or Site_Lung) to other clinical entities (such as Tumor_Finding or Cancer_Surgery). + +## Predicted Entities + +`is_location_of`, `O` + +{:.btn-box} +[Live Demo](https://demo.johnsnowlabs.com/healthcare/RE_ONCOLOGY/){:.button.button-orange} +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/27.Oncology_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/redl_oncology_location_biobert_wip_en_4.2.4_3.0_1673770597615.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + +Use relation pairs to include only the combinations of entities that are relevant in your case. + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer() \ + .setInputCols(["sentence"]) \ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token"]) \ + .setOutputCol("embeddings") + +ner = MedicalNerModel.pretrained("ner_oncology_wip", "en", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +ner_converter = NerConverterInternal() \ + .setInputCols(["sentence", "token", "ner"]) \ + .setOutputCol("ner_chunk") + +pos_tagger = PerceptronModel.pretrained("pos_clinical", "en", "clinical/models") \ + .setInputCols(["sentence", "token"]) \ + .setOutputCol("pos_tags") + +dependency_parser = DependencyParserModel.pretrained("dependency_conllu", "en") \ + .setInputCols(["sentence", "pos_tags", "token"]) \ + .setOutputCol("dependencies") + +re_ner_chunk_filter = RENerChunksFilter()\ + .setInputCols(["ner_chunk", "dependencies"])\ + .setOutputCol("re_ner_chunk")\ + .setMaxSyntacticDistance(10)\ + .setRelationPairs(["Tumor_Finding-Site_Breast", "Site_Breast-Tumor_Finding", "Tumor_Finding-Anatomical_Site", "Anatomical_Site-Tumor_Finding"]) + +re_model = RelationExtractionDLModel.pretrained("redl_oncology_location_biobert_wip", "en", "clinical/models")\ + .setInputCols(["re_ner_chunk", "sentence"])\ + .setOutputCol("relation_extraction") + +pipeline = Pipeline(stages=[document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter, + pos_tagger, + dependency_parser, + re_ner_chunk_filter, + re_model]) + +data = spark.createDataFrame([["In April 2011, she first noticed a lump in her 
right breast."]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") + .setInputCols(Array("document")) + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols(Array("sentence")) + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val ner = MedicalNerModel.pretrained("ner_oncology_wip", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverterInternal() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + +val pos_tagger = PerceptronModel.pretrained("pos_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("pos_tags") + +val dependency_parser = DependencyParserModel.pretrained("dependency_conllu", "en") + .setInputCols(Array("sentence", "pos_tags", "token")) + .setOutputCol("dependencies") + +val re_ner_chunk_filter = new RENerChunksFilter() + .setInputCols(Array("ner_chunk", "dependencies")) + .setOutputCol("re_ner_chunk") + .setMaxSyntacticDistance(10) + .setRelationPairs(Array("Tumor_Finding-Site_Breast", "Site_Breast-Tumor_Finding","Tumor_Finding-Anatomical_Site", "Anatomical_Site-Tumor_Finding")) + +val re_model = RelationExtractionDLModel.pretrained("redl_oncology_location_biobert_wip", "en", "clinical/models") + .setPredictionThreshold(0.5f) + .setInputCols(Array("re_ner_chunk", "sentence")) + .setOutputCol("relation_extraction") + +val pipeline = new Pipeline().setStages(Array(document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter, + 
pos_tagger, + dependency_parser, + re_ner_chunk_filter, + re_model)) + +val data = Seq("""In April 2011, she first noticed a lump in her right breast.""").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash ++--------------+-------------+-------------+-----------+------+-----------+-------------+-----------+------+----------+ +| relation| entity1|entity1_begin|entity1_end|chunk1| entity2|entity2_begin|entity2_end|chunk2|confidence| ++--------------+-------------+-------------+-----------+------+-----------+-------------+-----------+------+----------+ +|is_location_of|Tumor_Finding| 35| 38| lump|Site_Breast| 53| 58|breast| 0.9628376| ++--------------+-------------+-------------+-----------+------+-----------+-------------+-----------+------+----------+ +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|redl_oncology_location_biobert_wip| +|Compatibility:|Healthcare NLP 4.2.4+| +|License:|Licensed| +|Edition:|Official| +|Language:|en| +|Size:|401.7 MB| + +## References + +In-house annotated oncology case reports. + +## Benchmarking + +```bash + label recall precision f1 + O 0.90 0.94 0.92 +is_location_of 0.94 0.90 0.92 + macro-avg 0.92 0.92 0.92 +``` \ No newline at end of file diff --git a/docs/_posts/Meryem1425/2023-01-15-redl_oncology_size_biobert_wip_en.md b/docs/_posts/Meryem1425/2023-01-15-redl_oncology_size_biobert_wip_en.md new file mode 100644 index 00000000000000..860085ad2178b9 --- /dev/null +++ b/docs/_posts/Meryem1425/2023-01-15-redl_oncology_size_biobert_wip_en.md @@ -0,0 +1,192 @@ +--- +layout: model +title: Relation Extraction between Tumors and Sizes (ReDL) +author: John Snow Labs +name: redl_oncology_size_biobert_wip +date: 2023-01-15 +tags: [licensed, clinical, oncology, en, relation_extraction, tensorflow] +task: Relation Extraction +language: en +edition: Healthcare NLP 4.2.4 +spark_version: 3.0 +supported: true +engine: tensorflow +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This relation extraction model links Tumor_Size extractions to their corresponding Tumor_Finding extractions. 
+ +## Predicted Entities + +`is_size_of`, `O` + +{:.btn-box} +[Live Demo](https://demo.johnsnowlabs.com/healthcare/RE_ONCOLOGY/){:.button.button-orange} +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/27.Oncology_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/redl_oncology_size_biobert_wip_en_4.2.4_3.0_1673772352847.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + +Tumor_Finding and Tumor_Size should be included in the relation pairs. + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer() \ + .setInputCols(["sentence"]) \ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token"]) \ + .setOutputCol("embeddings") + +ner = MedicalNerModel.pretrained("ner_oncology_wip", "en", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +ner_converter = NerConverterInternal() \ + .setInputCols(["sentence", "token", "ner"]) \ + .setOutputCol("ner_chunk") + +pos_tagger = PerceptronModel.pretrained("pos_clinical", "en", "clinical/models") \ + .setInputCols(["sentence", "token"]) \ + .setOutputCol("pos_tags") + +dependency_parser = DependencyParserModel.pretrained("dependency_conllu", "en") \ + .setInputCols(["sentence", "pos_tags", "token"]) \ + .setOutputCol("dependencies") + +re_ner_chunk_filter = RENerChunksFilter()\ + .setInputCols(["ner_chunk", "dependencies"])\ + .setOutputCol("re_ner_chunk")\ + .setMaxSyntacticDistance(10)\ + .setRelationPairs(["Tumor_Finding-Tumor_Size", "Tumor_Size-Tumor_Finding"]) + +re_model = RelationExtractionDLModel.pretrained("redl_oncology_size_biobert_wip", "en", "clinical/models")\ + .setInputCols(["re_ner_chunk", "sentence"])\ + .setOutputCol("relation_extraction") + +pipeline = Pipeline(stages=[document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter, + pos_tagger, + dependency_parser, + re_ner_chunk_filter, + re_model]) + +data = spark.createDataFrame([["The patient presented a 2 cm mass in her left breast, and the tumor in her other breast was 3 cm long."]]).toDF("text") + 
+result = pipeline.fit(data).transform(data) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") + .setInputCols(Array("document")) + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols(Array("sentence")) + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val ner = MedicalNerModel.pretrained("ner_oncology_wip", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverterInternal() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + +val pos_tagger = PerceptronModel.pretrained("pos_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("pos_tags") + +val dependency_parser = DependencyParserModel.pretrained("dependency_conllu", "en") + .setInputCols(Array("sentence", "pos_tags", "token")) + .setOutputCol("dependencies") + +val re_ner_chunk_filter = new RENerChunksFilter() + .setInputCols(Array("ner_chunk", "dependencies")) + .setOutputCol("re_ner_chunk") + .setMaxSyntacticDistance(10) + .setRelationPairs(Array("Tumor_Finding-Tumor_Size", "Tumor_Size-Tumor_Finding")) + +val re_model = RelationExtractionDLModel.pretrained("redl_oncology_size_biobert_wip", "en", "clinical/models") + .setPredictionThreshold(0.5f) + .setInputCols(Array("re_ner_chunk", "sentence")) + .setOutputCol("relation_extraction") + +val pipeline = new Pipeline().setStages(Array(document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter, + pos_tagger, + dependency_parser, + re_ner_chunk_filter, + re_model)) + +val data = Seq("The patient presented a 2 
cm mass in her left breast, and the tumor in her other breast was 3 cm long.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash ++----------+-------------+-------------+-----------+------+-------------+-------------+-----------+------+----------+ +| relation| entity1|entity1_begin|entity1_end|chunk1| entity2|entity2_begin|entity2_end|chunk2|confidence| ++----------+-------------+-------------+-----------+------+-------------+-------------+-----------+------+----------+ +|is_size_of| Tumor_Size| 24| 27| 2 cm|Tumor_Finding| 29| 32| mass| 0.9604708| +|is_size_of|Tumor_Finding| 62| 66| tumor| Tumor_Size| 92| 95| 3 cm|0.99731797| ++----------+-------------+-------------+-----------+------+-------------+-------------+-----------+------+----------+ + +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|redl_oncology_size_biobert_wip| +|Compatibility:|Healthcare NLP 4.2.4+| +|License:|Licensed| +|Edition:|Official| +|Language:|en| +|Size:|401.7 MB| + +## References + +In-house annotated oncology case reports. + +## Benchmarking + +```bash + label recall precision f1 support + O 0.87 0.84 0.86 143.0 +is_size_of 0.85 0.88 0.86 157.0 + macro-avg 0.86 0.86 0.86 - +``` diff --git a/docs/_posts/Meryem1425/2023-01-15-redl_oncology_temporal_biobert_wip_en.md b/docs/_posts/Meryem1425/2023-01-15-redl_oncology_temporal_biobert_wip_en.md new file mode 100644 index 00000000000000..1165169710fe48 --- /dev/null +++ b/docs/_posts/Meryem1425/2023-01-15-redl_oncology_temporal_biobert_wip_en.md @@ -0,0 +1,191 @@ +--- +layout: model +title: Relation Extraction between dates and other entities (ReDL) +author: John Snow Labs +name: redl_oncology_temporal_biobert_wip +date: 2023-01-15 +tags: [licensed, clinical, oncology, en, relation_extraction, temporal, tensorflow] +task: Relation Extraction +language: en +edition: Healthcare NLP 4.2.4 +spark_version: 3.0 +supported: true +engine: tensorflow +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This relation extraction model links Date and Relative_Date 
extractions to clinical entities such as Test or Cancer_Dx. + +## Predicted Entities + +`is_date_of`, `O` + +{:.btn-box} +[Live Demo](https://demo.johnsnowlabs.com/healthcare/RE_ONCOLOGY/){:.button.button-orange} +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/27.Oncology_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/redl_oncology_temporal_biobert_wip_en_4.2.4_3.0_1673774363542.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + +Each relevant relation pair in the pipeline should include one date entity (Date or Relative_Date) and a clinical entity (such as Pathology_Test, Cancer_Dx or Chemotherapy). + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer() \ + .setInputCols(["sentence"]) \ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token"]) \ + .setOutputCol("embeddings") + +ner = MedicalNerModel.pretrained("ner_oncology_wip", "en", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +ner_converter = NerConverterInternal() \ + .setInputCols(["sentence", "token", "ner"]) \ + .setOutputCol("ner_chunk") + +pos_tagger = PerceptronModel.pretrained("pos_clinical", "en", "clinical/models") \ + .setInputCols(["sentence", "token"]) \ + .setOutputCol("pos_tags") + +dependency_parser = DependencyParserModel.pretrained("dependency_conllu", "en") \ + .setInputCols(["sentence", "pos_tags", "token"]) \ + .setOutputCol("dependencies") + +re_ner_chunk_filter = RENerChunksFilter()\ + .setInputCols(["ner_chunk", "dependencies"])\ + .setOutputCol("re_ner_chunk")\ + .setMaxSyntacticDistance(10)\ + .setRelationPairs(["Cancer_Dx-Date", "Date-Cancer_Dx", "Relative_Date-Cancer_Dx", "Cancer_Dx-Relative_Date", "Cancer_Surgery-Date", "Date-Cancer_Surgery", "Cancer_Surgery-Relative_Date", "Relative_Date-Cancer_Surgery"]) + +re_model = RelationExtractionDLModel.pretrained("redl_oncology_temporal_biobert_wip", "en", "clinical/models")\ + .setInputCols(["re_ner_chunk", "sentence"])\ + .setOutputCol("relation_extraction") + +pipeline = Pipeline(stages=[document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter, + pos_tagger, + dependency_parser, + re_ner_chunk_filter, + re_model]) + +data = 
spark.createDataFrame([["Her breast cancer was diagnosed last year."]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val ner = MedicalNerModel.pretrained("ner_oncology_wip", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverterInternal() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + +val pos_tagger = PerceptronModel.pretrained("pos_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("pos_tags") + +val dependency_parser = DependencyParserModel.pretrained("dependency_conllu", "en") + .setInputCols(Array("sentence", "pos_tags", "token")) + .setOutputCol("dependencies") + +val re_ner_chunk_filter = new RENerChunksFilter() + .setInputCols(Array("ner_chunk", "dependencies")) + .setOutputCol("re_ner_chunk") + .setMaxSyntacticDistance(10) + .setRelationPairs(Array("Cancer_Dx-Date", "Date-Cancer_Dx", "Relative_Date-Cancer_Dx", "Cancer_Dx-Relative_Date", "Cancer_Surgery-Date", "Date-Cancer_Surgery", "Cancer_Surgery-Relative_Date", "Relative_Date-Cancer_Surgery")) + +val re_model = RelationExtractionDLModel.pretrained("redl_oncology_temporal_biobert_wip", "en", "clinical/models") + .setPredictionThreshold(0.5f) + .setInputCols(Array("re_ner_chunk", "sentence")) + .setOutputCol("relation_extraction") + +val pipeline = new 
Pipeline().setStages(Array(document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter, + pos_tagger, + dependency_parser, + re_ner_chunk_filter, + re_model)) + +val data = Seq("Her breast cancer was diagnosed last year.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) + +``` +
+ +## Results + +```bash ++----------+---------+-------------+-----------+-------------+-------------+-------------+-----------+---------+----------+ +| relation| entity1|entity1_begin|entity1_end| chunk1| entity2|entity2_begin|entity2_end| chunk2|confidence| ++----------+---------+-------------+-----------+-------------+-------------+-------------+-----------+---------+----------+ +|is_date_of|Cancer_Dx| 4| 16|breast cancer|Relative_Date| 32| 40|last year| 0.9999256| ++----------+---------+-------------+-----------+-------------+-------------+-------------+-----------+---------+----------+ +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|redl_oncology_temporal_biobert_wip| +|Compatibility:|Healthcare NLP 4.2.4+| +|License:|Licensed| +|Edition:|Official| +|Language:|en| +|Size:|401.7 MB| + +## References + +In-house annotated oncology case reports. + +## Benchmarking + +```bash + label recall precision f1 support + O 0.77 0.81 0.79 302.0 +is_date_of 0.82 0.78 0.80 298.0 + macro-avg 0.79 0.79 0.79 - +``` \ No newline at end of file diff --git a/docs/_posts/Meryem1425/2023-01-15-redl_oncology_test_result_biobert_wip_en.md b/docs/_posts/Meryem1425/2023-01-15-redl_oncology_test_result_biobert_wip_en.md new file mode 100644 index 00000000000000..29b83bd58ef7be --- /dev/null +++ b/docs/_posts/Meryem1425/2023-01-15-redl_oncology_test_result_biobert_wip_en.md @@ -0,0 +1,192 @@ +--- +layout: model +title: Relation Extraction between Test and Results (ReDL) +author: John Snow Labs +name: redl_oncology_test_result_biobert_wip +date: 2023-01-15 +tags: [licensed, clinical, oncology, en, relation_extraction, test, tensorflow] +task: Relation Extraction +language: en +edition: Healthcare NLP 4.2.4 +spark_version: 3.0 +supported: true +engine: tensorflow +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This relation extraction model links test extractions to their corresponding results. 
+ +## Predicted Entities + +`is_finding_of`, `O` + +{:.btn-box} +[Live Demo](https://demo.johnsnowlabs.com/healthcare/RE_ONCOLOGY/){:.button.button-orange} +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/27.Oncology_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/redl_oncology_test_result_biobert_wip_en_4.2.4_3.0_1673776756086.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + +Each relevant relation pair in the pipeline should include one test entity (such as Biomarker, Imaging_Test, Pathology_Test or Oncogene) and one result entity (such as Biomarker_Result, Pathology_Result or Tumor_Finding). + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer() \ + .setInputCols(["sentence"]) \ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token"]) \ + .setOutputCol("embeddings") + +ner = MedicalNerModel.pretrained("ner_oncology_wip", "en", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +ner_converter = NerConverterInternal() \ + .setInputCols(["sentence", "token", "ner"]) \ + .setOutputCol("ner_chunk") + +pos_tagger = PerceptronModel.pretrained("pos_clinical", "en", "clinical/models") \ + .setInputCols(["sentence", "token"]) \ + .setOutputCol("pos_tags") + +dependency_parser = DependencyParserModel.pretrained("dependency_conllu", "en") \ + .setInputCols(["sentence", "pos_tags", "token"]) \ + .setOutputCol("dependencies") + +re_ner_chunk_filter = RENerChunksFilter()\ + .setInputCols(["ner_chunk", "dependencies"])\ + .setOutputCol("re_ner_chunk")\ + .setMaxSyntacticDistance(10)\ + .setRelationPairs(["Biomarker-Biomarker_Result", "Biomarker_Result-Biomarker", "Oncogene-Biomarker_Result", "Biomarker_Result-Oncogene", "Pathology_Test-Pathology_Result", "Pathology_Result-Pathology_Test"]) + +re_model = RelationExtractionDLModel.pretrained("redl_oncology_test_result_biobert_wip", "en", "clinical/models")\ + .setInputCols(["re_ner_chunk", "sentence"])\ + .setOutputCol("relation_extraction") + +pipeline = Pipeline(stages=[document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter, + pos_tagger, + dependency_parser, + re_ner_chunk_filter, + re_model]) + +data = 
spark.createDataFrame([["Pathology showed tumor cells, which were positive for estrogen and progesterone receptors."]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val ner = MedicalNerModel.pretrained("ner_oncology_wip", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverterInternal() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + +val pos_tagger = PerceptronModel.pretrained("pos_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("pos_tags") + +val dependency_parser = DependencyParserModel.pretrained("dependency_conllu", "en") + .setInputCols(Array("sentence", "pos_tags", "token")) + .setOutputCol("dependencies") + +val re_ner_chunk_filter = new RENerChunksFilter() + .setInputCols(Array("ner_chunk", "dependencies")) + .setOutputCol("re_ner_chunk") + .setMaxSyntacticDistance(10) + .setRelationPairs(Array("Biomarker-Biomarker_Result", "Biomarker_Result-Biomarker", "Oncogene-Biomarker_Result", "Biomarker_Result-Oncogene", "Pathology_Test-Pathology_Result", "Pathology_Result-Pathology_Test")) + +val re_model = RelationExtractionDLModel.pretrained("redl_oncology_test_result_biobert_wip", "en", "clinical/models") + .setPredictionThreshold(0.5f) + .setInputCols(Array("re_ner_chunk", "sentence")) + .setOutputCol("relation_extraction") + 
+val pipeline = new Pipeline().setStages(Array(document_assembler, + sentence_detector, + tokenizer, + word_embeddings, + ner, + ner_converter, + pos_tagger, + dependency_parser, + re_ner_chunk_filter, + re_model)) + +val data = Seq("Pathology showed tumor cells, which were positive for estrogen and progesterone receptors.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash ++-------------+----------------+-------------+-----------+---------+----------------+-------------+-----------+--------------------+----------+ +| relation| entity1|entity1_begin|entity1_end| chunk1| entity2|entity2_begin|entity2_end| chunk2|confidence| ++-------------+----------------+-------------+-----------+---------+----------------+-------------+-----------+--------------------+----------+ +|is_finding_of| Pathology_Test| 0| 8|Pathology|Pathology_Result| 17| 27| tumor cells| 0.8494344| +|is_finding_of|Biomarker_Result| 41| 48| positive| Biomarker| 54| 61| estrogen|0.99451536| +|is_finding_of|Biomarker_Result| 41| 48| positive| Biomarker| 67| 88|progesterone rece...|0.99218905| ++-------------+----------------+-------------+-----------+---------+----------------+-------------+-----------+--------------------+----------+ +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|redl_oncology_test_result_biobert_wip| +|Compatibility:|Healthcare NLP 4.2.4+| +|License:|Licensed| +|Edition:|Official| +|Language:|en| +|Size:|401.7 MB| + +## References + +In-house annotated oncology case reports. 
+ +## Benchmarking + +```bash + label recall precision f1 + O 0.87 0.92 0.9 +is_finding_of 0.93 0.88 0.9 + macro-avg 0.90 0.90 0.9 +``` \ No newline at end of file diff --git a/docs/_posts/Meryem1425/2023-01-15-redl_temporal_events_biobert_en.md b/docs/_posts/Meryem1425/2023-01-15-redl_temporal_events_biobert_en.md new file mode 100644 index 00000000000000..c3098ae0d2d5c5 --- /dev/null +++ b/docs/_posts/Meryem1425/2023-01-15-redl_temporal_events_biobert_en.md @@ -0,0 +1,197 @@ +--- +layout: model +title: Extract temporal relations among clinical events (ReDL) +author: John Snow Labs +name: redl_temporal_events_biobert +date: 2023-01-15 +tags: [relation_extraction, en, clinical, licensed, tensorflow] +task: Relation Extraction +language: en +edition: Healthcare NLP 4.2.4 +spark_version: 3.0 +supported: true +engine: tensorflow +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Extract relations between clinical events in terms of time. If an event occurred before, after, or overlaps another event. + +## Predicted Entities + +`AFTER`, `BEFORE`, `OVERLAP` + +{:.btn-box} +[Live Demo](https://demo.johnsnowlabs.com/healthcare/RE_CLINICAL_EVENTS/){:.button.button-orange} +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/10.Clinical_Relation_Extraction.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/redl_temporal_events_biobert_en_4.2.4_3.0_1673778147598.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documenter = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentencer = SentenceDetector()\ + .setInputCols(["document"])\ + .setOutputCol("sentences") + +tokenizer = sparknlp.annotators.Tokenizer()\ + .setInputCols(["sentences"])\ + .setOutputCol("tokens") + +pos_tagger = PerceptronModel()\ + .pretrained("pos_clinical", "en", "clinical/models") \ + .setInputCols(["sentences", "tokens"])\ + .setOutputCol("pos_tags") + +words_embedder = WordEmbeddingsModel() \ + .pretrained("embeddings_clinical", "en", "clinical/models") \ + .setInputCols(["sentences", "tokens"]) \ + .setOutputCol("embeddings") + +ner_tagger = MedicalNerModel.pretrained("ner_events_clinical", "en", "clinical/models")\ + .setInputCols("sentences", "tokens", "embeddings")\ + .setOutputCol("ner_tags") + +ner_converter = NerConverterInternal() \ + .setInputCols(["sentences", "tokens", "ner_tags"]) \ + .setOutputCol("ner_chunks") + +dependency_parser = DependencyParserModel() \ + .pretrained("dependency_conllu", "en") \ + .setInputCols(["sentences", "pos_tags", "tokens"]) \ + .setOutputCol("dependencies") + +re_ner_chunk_filter = RENerChunksFilter() \ + .setInputCols(["ner_chunks", "dependencies"])\ + .setMaxSyntacticDistance(10)\ + .setOutputCol("re_ner_chunks") + +re_model = RelationExtractionDLModel()\ + .pretrained("redl_temporal_events_biobert", "en", "clinical/models") \ + .setPredictionThreshold(0.5)\ + .setInputCols(["re_ner_chunks", "sentences"]) \ + .setOutputCol("relations") + +pipeline = Pipeline(stages=[documenter, sentencer, tokenizer, pos_tagger, words_embedder, ner_tagger, ner_converter, dependency_parser, re_ner_chunk_filter, re_model]) + +text = "She is diagnosed with cancer in 1991. 
Then she was admitted to Mayo Clinic in May 2000 and discharged in October 2001" + +data = spark.createDataFrame([[text]]).toDF("text") + +p_model = pipeline.fit(data) + +result = p_model.transform(data) +``` +```scala +val documenter = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentencer = new SentenceDetector() + .setInputCols("document") + .setOutputCol("sentences") + +val tokenizer = new Tokenizer() + .setInputCols("sentences") + .setOutputCol("tokens") + +val pos_tagger = PerceptronModel + .pretrained("pos_clinical", "en", "clinical/models") + .setInputCols(Array("sentences", "tokens")) + .setOutputCol("pos_tags") + +val words_embedder = WordEmbeddingsModel + .pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentences", "tokens")) + .setOutputCol("embeddings") + +val ner_tagger = MedicalNerModel.pretrained("ner_events_clinical", "en", "clinical/models") + .setInputCols(Array("sentences", "tokens", "embeddings")) + .setOutputCol("ner_tags") + +val ner_converter = new NerConverterInternal() + .setInputCols(Array("sentences", "tokens", "ner_tags")) + .setOutputCol("ner_chunks") + +val dependency_parser = DependencyParserModel + .pretrained("dependency_conllu", "en") + .setInputCols(Array("sentences", "pos_tags", "tokens")) + .setOutputCol("dependencies") + +// Set a filter on pairs of named entities which will be treated as relation candidates +val re_ner_chunk_filter = new RENerChunksFilter() + .setInputCols(Array("ner_chunks", "dependencies")) + .setMaxSyntacticDistance(10) + .setOutputCol("re_ner_chunks") + +// The dataset this model is trained on is sentence-wise. +// This model can also be trained on document-level relations - in which case, while predicting, use "document" instead of "sentence" as input. 
+val re_model = RelationExtractionDLModel + .pretrained("redl_temporal_events_biobert", "en", "clinical/models") + .setPredictionThreshold(0.5f) + .setInputCols(Array("re_ner_chunks", "sentences")) + .setOutputCol("relations") + +val pipeline = new Pipeline().setStages(Array(documenter, sentencer, tokenizer, pos_tagger, words_embedder, ner_tagger, ner_converter, dependency_parser, re_ner_chunk_filter, re_model)) + +val data = Seq("""She is diagnosed with cancer in 1991. Then she was admitted to Mayo Clinic in May 2000 and discharged in October 2001""").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash ++--------+-------------+-------------+-----------+-----------+-------------+-------------+-----------+------------+----------+ +|relation| entity1|entity1_begin|entity1_end| chunk1| entity2|entity2_begin|entity2_end| chunk2|confidence| ++--------+-------------+-------------+-----------+-----------+-------------+-------------+-----------+------------+----------+ +| BEFORE| OCCURRENCE| 7| 15| diagnosed| PROBLEM| 22| 27| cancer|0.78168863| +| OVERLAP| PROBLEM| 22| 27| cancer| DATE| 32| 35| 1991| 0.8492274| +| AFTER| OCCURRENCE| 51| 58| admitted|CLINICAL_DEPT| 63| 73| Mayo Clinic|0.85629463| +| BEFORE| OCCURRENCE| 51| 58| admitted| OCCURRENCE| 91| 100| discharged| 0.6843513| +| OVERLAP|CLINICAL_DEPT| 63| 73|Mayo Clinic| DATE| 78| 85| May 2000| 0.7844673| +| BEFORE|CLINICAL_DEPT| 63| 73|Mayo Clinic| OCCURRENCE| 91| 100| discharged|0.60411876| +| OVERLAP|CLINICAL_DEPT| 63| 73|Mayo Clinic| DATE| 105| 116|October 2001| 0.540761| +| BEFORE| DATE| 78| 85| May 2000| OCCURRENCE| 91| 100| discharged| 0.6042761| +| OVERLAP| DATE| 78| 85| May 2000| DATE| 105| 116|October 2001|0.64867175| +| BEFORE| OCCURRENCE| 91| 100| discharged| DATE| 105| 116|October 2001| 0.5302478| ++--------+-------------+-------------+-----------+-----------+-------------+-------------+-----------+------------+----------+ + +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|redl_temporal_events_biobert| +|Compatibility:|Healthcare NLP 4.2.4+| +|License:|Licensed| +|Edition:|Official| +|Language:|en| +|Size:|401.7 MB| + +## References + +Trained on temporal clinical events benchmark dataset. + +## Benchmarking + +```bash +label Recall Precision F1 Support +AFTER 0.332 0.655 0.440 2123 +BEFORE 0.868 0.908 0.887 13817 +OVERLAP 0.887 0.733 0.802 7860 +Avg. 
0.695 0.765 0.710 - +``` \ No newline at end of file From 9e2dbd24efaad15d7774b1734d780e2e05e34d33 Mon Sep 17 00:00:00 2001 From: jsl-models <74001263+jsl-models@users.noreply.github.com> Date: Thu, 26 Jan 2023 14:22:02 +0700 Subject: [PATCH 52/57] 2023-01-25-ner_eu_clinical_case_en (#13415) --- .../2023-01-25-ner_eu_clinical_case_en.md | 180 ++++++++++++++++++ 1 file changed, 180 insertions(+) create mode 100644 docs/_posts/gpirge/2023-01-25-ner_eu_clinical_case_en.md diff --git a/docs/_posts/gpirge/2023-01-25-ner_eu_clinical_case_en.md b/docs/_posts/gpirge/2023-01-25-ner_eu_clinical_case_en.md new file mode 100644 index 00000000000000..0adc7f6307b107 --- /dev/null +++ b/docs/_posts/gpirge/2023-01-25-ner_eu_clinical_case_en.md @@ -0,0 +1,180 @@ +--- +layout: model +title: Detect Clinical Entities (ner_eu_clinical_case) +author: John Snow Labs +name: ner_eu_clinical_case +date: 2023-01-25 +tags: [clinical, licensed, ner, en] +task: Named Entity Recognition +language: en +edition: Healthcare NLP 4.2.7 +spark_version: 3.0 +supported: true +annotator: MedicalNerModel +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained named entity recognition (NER) deep learning model for clinical entities. The SparkNLP deep learning model (MedicalNerModel) is inspired by a former state of the art model for NER: Chiu & Nichols, Named Entity Recognition with Bidirectional LSTM-CNN. + +The corpus used for model training is provided by European Clinical Case Corpus (E3C), a project aimed at offering a freely available multilingual corpus of semantically annotated clinical narratives. 
+ +## Predicted Entities + +`clinical_event`, `bodypart`, `clinical_condition`, `units_measurements`, `patient`, `date_time` + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_eu_clinical_case_en_4.2.7_3.2_1674657662344.zip){:.button.button-orange} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/clinical/models/ner_eu_clinical_case_en_4.2.7_3.2_1674657662344.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentenceDetectorDL = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "en")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer()\ + .setInputCols(["sentence"])\ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical","en","clinical/models")\ + .setInputCols(["sentence","token"])\ + .setOutputCol("embeddings") + +ner = MedicalNerModel.pretrained('ner_eu_clinical_case', "en", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +ner_converter = NerConverter()\ + .setInputCols(["sentence", "token", "ner"])\ + .setOutputCol("ner_chunk") + +pipeline = Pipeline(stages=[ + document_assembler, + sentenceDetectorDL, + tokenizer, + word_embeddings, + ner, + ner_converter]) + +data = spark.createDataFrame([["""A 3-year-old boy with autistic disorder on hospital of pediatric ward A at university hospital. He has no family history of illness or autistic spectrum disorder. The child was diagnosed with a severe communication disorder, with social interaction difficulties and sensory processing delay. Blood work was normal (thyroid-stimulating hormone (TSH), hemoglobin, mean corpuscular volume (MCV), and ferritin). Upper endoscopy also showed a submucosal tumor causing subtotal obstruction of the gastric outlet. Because a gastrointestinal stromal tumor was suspected, distal gastrectomy was performed. 
Histopathological examination revealed spindle cell proliferation in the submucosal layer."""]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documenter = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentenceDetector = SentenceDetectorDLModel.pretrained() + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical","en","clinical/models") + .setInputCols(Array("sentence","token")) + .setOutputCol("embeddings") + +val ner_model = MedicalNerModel.pretrained("ner_eu_clinical_case", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverter() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + +val pipeline = new Pipeline().setStages(Array(documenter, sentenceDetector, tokenizer, word_embeddings, ner_model, ner_converter)) + +val data = Seq(Array("""A 3-year-old boy with autistic disorder on hospital of pediatric ward A at university hospital. He has no family history of illness or autistic spectrum disorder. The child was diagnosed with a severe communication disorder, with social interaction difficulties and sensory processing delay. Blood work was normal (thyroid-stimulating hormone (TSH), hemoglobin, mean corpuscular volume (MCV), and ferritin). Upper endoscopy also showed a submucosal tumor causing subtotal obstruction of the gastric outlet. Because a gastrointestinal stromal tumor was suspected, distal gastrectomy was performed. Histopathological examination revealed spindle cell proliferation in the submucosal layer.""")).toDS().toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash ++------------------------------+------------------+ +|chunk |ner_label | ++------------------------------+------------------+ +|A 3-year-old boy |patient | +|autistic disorder |clinical_condition| +|He |patient | +|illness |clinical_event | +|autistic spectrum disorder |clinical_condition| +|The child |patient | +|diagnosed |clinical_event | +|disorder |clinical_event | +|difficulties |clinical_event | +|Blood |bodypart | +|work |clinical_event | +|normal |units_measurements| +|hormone |clinical_event | +|hemoglobin |clinical_event | +|volume |clinical_event | +|endoscopy |clinical_event | +|showed |clinical_event | +|tumor |clinical_condition| +|causing |clinical_event | +|obstruction |clinical_event | +|the gastric outlet |bodypart | +|gastrointestinal stromal tumor|clinical_condition| +|suspected |clinical_event | +|gastrectomy |clinical_event | +|examination |clinical_event | +|revealed |clinical_event | +|spindle cell proliferation |clinical_condition| +|the submucosal layer |bodypart | ++------------------------------+------------------+ + + +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_eu_clinical_case| +|Compatibility:|Healthcare NLP 4.2.7+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|en| +|Size:|849.0 KB| + +## References + +The corpus used for model training is provided by European Clinical Case Corpus (E3C), a project aimed at offering a freely available multilingual corpus of semantically annotated clinical narratives. 
+ +## Benchmarking + +```bash + label tp fp fn total precision recall f1 + date_time 54.0 7.0 15.0 69.0 0.8852 0.7826 0.8308 +units_measurements 111.0 48.0 12.0 123.0 0.6981 0.9024 0.7872 +clinical_condition 93.0 47.0 81.0 174.0 0.6643 0.5345 0.5924 + patient 119.0 16.0 5.0 124.0 0.8815 0.9597 0.9189 + clinical_event 331.0 126.0 89.0 420.0 0.7243 0.7881 0.7548 + bodypart 171.0 58.0 84.0 255.0 0.7467 0.6706 0.7066 + macro - - - - - - 0.7651 + micro - - - - - - 0.7454 +``` From 151529877dc7dd0fab543bf120465945c035a8b0 Mon Sep 17 00:00:00 2001 From: jsl-models <74001263+jsl-models@users.noreply.github.com> Date: Wed, 8 Feb 2023 00:55:43 +0700 Subject: [PATCH 53/57] 2023-02-01-ner_eu_clinical_case_es (#13454) --- .../2023-02-01-ner_eu_clinical_case_es.md | 189 +++++++++++++++++ .../2023-02-01-ner_eu_clinical_case_fr.md | 182 ++++++++++++++++ .../2023-02-02-ner_eu_clinical_case_eu.md | 194 ++++++++++++++++++ ...2023-02-06-ner_eu_clinical_condition_en.md | 153 ++++++++++++++ ...2023-02-06-ner_eu_clinical_condition_es.md | 149 ++++++++++++++ ...2023-02-06-ner_eu_clinical_condition_eu.md | 153 ++++++++++++++ ...2023-02-06-ner_eu_clinical_condition_fr.md | 152 ++++++++++++++ ...2023-02-06-ner_eu_clinical_condition_it.md | 155 ++++++++++++++ 8 files changed, 1327 insertions(+) create mode 100644 docs/_posts/gpirge/2023-02-01-ner_eu_clinical_case_es.md create mode 100644 docs/_posts/gpirge/2023-02-01-ner_eu_clinical_case_fr.md create mode 100644 docs/_posts/gpirge/2023-02-02-ner_eu_clinical_case_eu.md create mode 100644 docs/_posts/gpirge/2023-02-06-ner_eu_clinical_condition_en.md create mode 100644 docs/_posts/gpirge/2023-02-06-ner_eu_clinical_condition_es.md create mode 100644 docs/_posts/gpirge/2023-02-06-ner_eu_clinical_condition_eu.md create mode 100644 docs/_posts/gpirge/2023-02-06-ner_eu_clinical_condition_fr.md create mode 100644 docs/_posts/gpirge/2023-02-06-ner_eu_clinical_condition_it.md diff --git a/docs/_posts/gpirge/2023-02-01-ner_eu_clinical_case_es.md 
b/docs/_posts/gpirge/2023-02-01-ner_eu_clinical_case_es.md new file mode 100644 index 00000000000000..04cc1cbc2853e5 --- /dev/null +++ b/docs/_posts/gpirge/2023-02-01-ner_eu_clinical_case_es.md @@ -0,0 +1,189 @@ +--- +layout: model +title: Detect Clinical Entities (ner_eu_clinical_case - es) +author: John Snow Labs +name: ner_eu_clinical_case +date: 2023-02-01 +tags: [es, clinical, licensed, ner] +task: Named Entity Recognition +language: es +edition: Healthcare NLP 4.2.8 +spark_version: 3.0 +supported: true +annotator: MedicalNerModel +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained named entity recognition (NER) deep learning model for extracting clinical entities from Spanish texts. The SparkNLP deep learning model (MedicalNerModel) is inspired by a former state of the art model for NER: Chiu & Nichols, Named Entity Recognition with Bidirectional LSTM-CNN. + +The corpus used for model training is provided by European Clinical Case Corpus (E3C), a project aimed at offering a freely available multilingual corpus of semantically annotated clinical narratives. + +## Predicted Entities + +`clinical_event`, `bodypart`, `clinical_condition`, `units_measurements`, `patient`, `date_time` + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_eu_clinical_case_es_4.2.8_3.0_1675285093855.zip){:.button.button-orange} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/clinical/models/ner_eu_clinical_case_es_4.2.8_3.0_1675285093855.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentenceDetectorDL = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer()\ + .setInputCols(["sentence"])\ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel.pretrained("w2v_cc_300d","es")\ + .setInputCols(["sentence","token"])\ + .setOutputCol("embeddings") + +ner = MedicalNerModel.pretrained("ner_eu_clinical_case", "es", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +ner_converter = NerConverterInternal()\ + .setInputCols(["sentence", "token", "ner"])\ + .setOutputCol("ner_chunk") + +pipeline = Pipeline(stages=[ + document_assembler, + sentenceDetectorDL, + tokenizer, + word_embeddings, + ner, + ner_converter]) + +data = spark.createDataFrame([["""Un niño de 3 años con trastorno autista en el hospital de la sala pediátrica A del hospital universitario. No tiene antecedentes familiares de enfermedad o trastorno del espectro autista. El niño fue diagnosticado con un trastorno de comunicación severo, con dificultades de interacción social y retraso en el procesamiento sensorial. Los análisis de sangre fueron normales (hormona estimulante de la tiroides (TSH), hemoglobina, volumen corpuscular medio (MCV) y ferritina). La endoscopia alta también mostró un tumor submucoso que causaba una obstrucción subtotal de la salida gástrica. Ante la sospecha de tumor del estroma gastrointestinal, se realizó gastrectomía distal. 
El examen histopatológico reveló proliferación de células fusiformes en la capa submucosa."""]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentenceDetectorDL = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx") + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel.pretrained("w2v_cc_300d","es") + .setInputCols(Array("sentence","token")) + .setOutputCol("embeddings") + +val ner = MedicalNerModel.pretrained("ner_eu_clinical_case", "es", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverterInternal() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + +val pipeline = new Pipeline().setStages(Array( + document_assembler, + sentenceDetectorDL, + tokenizer, + word_embeddings, + ner, + ner_converter)) + +val data = Seq("""Un niño de 3 años con trastorno autista en el hospital de la sala pediátrica A del hospital universitario. No tiene antecedentes familiares de enfermedad o trastorno del espectro autista. El niño fue diagnosticado con un trastorno de comunicación severo, con dificultades de interacción social y retraso en el procesamiento sensorial. Los análisis de sangre fueron normales (hormona estimulante de la tiroides (TSH), hemoglobina, volumen corpuscular medio (MCV) y ferritina). La endoscopia alta también mostró un tumor submucoso que causaba una obstrucción subtotal de la salida gástrica. Ante la sospecha de tumor del estroma gastrointestinal, se realizó gastrectomía distal. El examen histopatológico reveló proliferación de células fusiformes en la capa submucosa.""").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash ++--------------------------------+------------------+ +|chunk |ner_label | ++--------------------------------+------------------+ +|Un niño de 3 años |patient | +|trastorno autista |clinical_event | +|antecedentes |clinical_event | +|enfermedad |clinical_event | +|trastorno del espectro autista |clinical_event | +|El niño |patient | +|diagnosticado |clinical_event | +|trastorno de comunicación severo|clinical_event | +|dificultades |clinical_event | +|retraso |clinical_event | +|análisis |clinical_event | +|sangre |bodypart | +|normales |units_measurements| +|hormona |clinical_event | +|la tiroides |bodypart | +|TSH |clinical_event | +|hemoglobina |clinical_event | +|volumen |clinical_event | +|MCV |clinical_event | +|ferritina |clinical_event | +|endoscopia |clinical_event | +|mostró |clinical_event | +|tumor submucoso |clinical_event | +|obstrucción |clinical_event | +|tumor |clinical_event | +|del estroma gastrointestinal |bodypart | +|gastrectomía |clinical_event | +|examen |clinical_event | +|reveló |clinical_event | +|proliferación |clinical_event | +|células fusiformes |bodypart | +|la capa submucosa |bodypart | ++--------------------------------+------------------+ +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_eu_clinical_case| +|Compatibility:|Healthcare NLP 4.2.8+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|es| +|Size:|895.1 KB| + +## References + +The corpus used for model training is provided by European Clinical Case Corpus (E3C), a project aimed at offering a freely available multilingual corpus of semantically annotated clinical narratives. 
+ +## Benchmarking + +```bash + label tp fp fn total precision recall f1 + date_time 87.0 10.0 17.0 104.0 0.8969 0.8365 0.8657 +units_measurements 37.0 5.0 11.0 48.0 0.8810 0.7708 0.8222 +clinical_condition 50.0 34.0 70.0 120.0 0.5952 0.4167 0.4902 + patient 76.0 8.0 11.0 87.0 0.9048 0.8736 0.8889 + clinical_event 399.0 44.0 79.0 478.0 0.9007 0.8347 0.8664 + bodypart 153.0 56.0 13.0 166.0 0.7321 0.9217 0.8160 + macro - - - - - - 0.7916 + micro - - - - - - 0.8128 +``` diff --git a/docs/_posts/gpirge/2023-02-01-ner_eu_clinical_case_fr.md b/docs/_posts/gpirge/2023-02-01-ner_eu_clinical_case_fr.md new file mode 100644 index 00000000000000..7e21761fdf5af8 --- /dev/null +++ b/docs/_posts/gpirge/2023-02-01-ner_eu_clinical_case_fr.md @@ -0,0 +1,182 @@ +--- +layout: model +title: Detect Clinical Entities (ner_eu_clinical_case - fr) +author: John Snow Labs +name: ner_eu_clinical_case +date: 2023-02-01 +tags: [fr, clinical, licensed, ner] +task: Named Entity Recognition +language: fr +edition: Healthcare NLP 4.2.8 +spark_version: 3.0 +supported: true +annotator: MedicalNerModel +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained named entity recognition (NER) deep learning model for extracting clinical entities from French texts. The SparkNLP deep learning model (MedicalNerModel) is inspired by a former state of the art model for NER: Chiu & Nichols, Named Entity Recognition with Bidirectional LSTM-CNN. + +The corpus used for model training is provided by European Clinical Case Corpus (E3C), a project aimed at offering a freely available multilingual corpus of semantically annotated clinical narratives. 
+ +## Predicted Entities + +`clinical_event`, `bodypart`, `clinical_condition`, `units_measurements`, `patient`, `date_time` + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_eu_clinical_case_fr_4.2.8_3.0_1675293960896.zip){:.button.button-orange} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/clinical/models/ner_eu_clinical_case_fr_4.2.8_3.0_1675293960896.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentenceDetectorDL = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer()\ + .setInputCols(["sentence"])\ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel.pretrained("w2v_cc_300d","fr")\ + .setInputCols(["sentence","token"])\ + .setOutputCol("embeddings") + +ner = MedicalNerModel.pretrained('ner_eu_clinical_case', "fr", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +ner_converter = NerConverterInternal()\ + .setInputCols(["sentence", "token", "ner"])\ + .setOutputCol("ner_chunk") + +pipeline = Pipeline(stages=[ + document_assembler, + sentenceDetectorDL, + tokenizer, + word_embeddings, + ner, + ner_converter]) + +data = spark.createDataFrame([["""Un garçon de 3 ans atteint d'un trouble autistique à l'hôpital du service pédiatrique A de l'hôpital universitaire. Il n'a pas d'antécédents familiaux de troubles ou de maladies du spectre autistique. Le garçon a été diagnostiqué avec un trouble de communication sévère, avec des difficultés d'interaction sociale et un traitement sensoriel retardé. Les tests sanguins étaient normaux (thyréostimuline (TSH), hémoglobine, volume globulaire moyen (MCV) et ferritine). L'endoscopie haute a également montré une tumeur sous-muqueuse provoquant une obstruction subtotale de la sortie gastrique. Devant la suspicion d'une tumeur stromale gastro-intestinale, une gastrectomie distale a été réalisée. 
L'examen histopathologique a révélé une prolifération de cellules fusiformes dans la couche sous-muqueuse."""]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documenter = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentenceDetector = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx") + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel.pretrained("w2v_cc_300d","fr") + .setInputCols(Array("sentence","token")) + .setOutputCol("embeddings") + +val ner_model = MedicalNerModel.pretrained("ner_eu_clinical_case", "fr", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverterInternal() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + +val pipeline = new Pipeline().setStages(Array(documenter, sentenceDetector, tokenizer, word_embeddings, ner_model, ner_converter)) + +val data = Seq(Array("""Un garçon de 3 ans atteint d'un trouble autistique à l'hôpital du service pédiatrique A de l'hôpital universitaire. Il n'a pas d'antécédents familiaux de troubles ou de maladies du spectre autistique. Le garçon a été diagnostiqué avec un trouble de communication sévère, avec des difficultés d'interaction sociale et un traitement sensoriel retardé. Les tests sanguins étaient normaux (thyréostimuline (TSH), hémoglobine, volume globulaire moyen (MCV) et ferritine). L'endoscopie haute a également montré une tumeur sous-muqueuse provoquant une obstruction subtotale de la sortie gastrique. Devant la suspicion d'une tumeur stromale gastro-intestinale, une gastrectomie distale a été réalisée. 
L'examen histopathologique a révélé une prolifération de cellules fusiformes dans la couche sous-muqueuse.""")).toDS().toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash ++-----------------------------------------------------+------------------+ +|chunk |ner_label | ++-----------------------------------------------------+------------------+ +|Un garçon de 3 ans |patient | +|trouble autistique à l'hôpital du service pédiatrique|clinical_condition| +|l'hôpital |clinical_event | +|Il n'a |patient | +|d'antécédents |clinical_event | +|troubles |clinical_condition| +|maladies |clinical_condition| +|du spectre autistique |bodypart | +|Le garçon |patient | +|diagnostiqué |clinical_event | +|trouble |clinical_condition| +|difficultés |clinical_event | +|traitement |clinical_event | +|tests |clinical_event | +|normaux |units_measurements| +|thyréostimuline |clinical_event | +|TSH |clinical_event | +|ferritine |clinical_event | +|L'endoscopie |clinical_event | +|montré |clinical_event | +|tumeur sous-muqueuse |clinical_condition| +|provoquant |clinical_event | +|obstruction |clinical_condition| +|la sortie gastrique |bodypart | +|suspicion |clinical_event | +|tumeur stromale gastro-intestinale |clinical_condition| +|gastrectomie |clinical_event | +|L'examen |clinical_event | +|révélé |clinical_event | +|prolifération |clinical_event | +|cellules fusiformes |bodypart | +|la couche sous-muqueuse |bodypart | ++-----------------------------------------------------+------------------+ +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_eu_clinical_case| +|Compatibility:|Healthcare NLP 4.2.8+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|fr| +|Size:|895.0 KB| + +## References + +The corpus used for model training is provided by European Clinical Case Corpus (E3C), a project aimed at offering a freely available multilingual corpus of semantically annotated clinical narratives. 
+ +## Benchmarking + +```bash + label tp fp fn total precision recall f1 + date_time 49.0 14.0 21.0 70.0 0.7778 0.7000 0.7368 +units_measurements 92.0 19.0 6.0 98.0 0.8288 0.9388 0.8804 +clinical_condition 178.0 74.0 73.0 251.0 0.7063 0.7092 0.7078 + patient 114.0 6.0 15.0 129.0 0.9500 0.8837 0.9157 + clinical_event 265.0 81.0 71.0 336.0 0.7659 0.7887 0.7771 + bodypart 243.0 34.0 64.0 307.0 0.8773 0.7915 0.8322 + macro - - - - - - 0.8083 + micro - - - - - - 0.7978 +``` diff --git a/docs/_posts/gpirge/2023-02-02-ner_eu_clinical_case_eu.md b/docs/_posts/gpirge/2023-02-02-ner_eu_clinical_case_eu.md new file mode 100644 index 00000000000000..4c98890d5cecf6 --- /dev/null +++ b/docs/_posts/gpirge/2023-02-02-ner_eu_clinical_case_eu.md @@ -0,0 +1,194 @@ +--- +layout: model +title: Detect Clinical Entities (ner_eu_clinical_case - eu) +author: John Snow Labs +name: ner_eu_clinical_case +date: 2023-02-02 +tags: [eu, clinical, licensed, ner] +task: Named Entity Recognition +language: eu +edition: Healthcare NLP 4.2.8 +spark_version: 3.0 +supported: true +annotator: MedicalNerModel +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained named entity recognition (NER) deep learning model for extracting clinical entities from Basque texts. The SparkNLP deep learning model (MedicalNerModel) is inspired by a former state of the art model for NER: Chiu & Nichols, Named Entity Recognition with Bidirectional LSTM-CNN. + +The corpus used for model training is provided by European Clinical Case Corpus (E3C), a project aimed at offering a freely available multilingual corpus of semantically annotated clinical narratives. 
+ +## Predicted Entities + +`clinical_event`, `bodypart`, `clinical_condition`, `units_measurements`, `patient`, `date_time` + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_eu_clinical_case_eu_4.2.8_3.0_1675359410041.zip){:.button.button-orange} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/clinical/models/ner_eu_clinical_case_eu_4.2.8_3.0_1675359410041.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentenceDetectorDL = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer()\ + .setInputCols(["sentence"])\ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel.pretrained("w2v_cc_300d","eu")\ + .setInputCols(["sentence","token"])\ + .setOutputCol("embeddings") + +ner = MedicalNerModel.pretrained('ner_eu_clinical_case', "eu", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +ner_converter = NerConverterInternal()\ + .setInputCols(["sentence", "token", "ner"])\ + .setOutputCol("ner_chunk") + +pipeline = Pipeline(stages=[ + document_assembler, + sentenceDetectorDL, + tokenizer, + word_embeddings, + ner, + ner_converter]) + +data = spark.createDataFrame([["""3 urteko mutiko bat nahasmendu autistarekin unibertsitateko ospitaleko A pediatriako ospitalean. Ez du autismoaren espektroaren nahaste edo gaixotasun familiaren aurrekaririk. Mutilari komunikazio-nahaste larria diagnostikatu zioten, elkarrekintza sozialeko zailtasunak eta prozesamendu sentsorial atzeratua. Odol-analisiak normalak izan ziren (tiroidearen hormona estimulatzailea (TSH), hemoglobina, batez besteko bolumen corpuskularra (MCV) eta ferritina). Goiko endoskopiak mukosaren azpiko tumore bat ere erakutsi zuen, urdail-irteeren guztizko oztopoa eragiten zuena. Estroma gastrointestinalaren tumore baten susmoa ikusita, distaleko gastrektomia egin zen. 
Azterketa histopatologikoak agerian utzi zuen mukosaren azpiko zelulen ugaltzea."""]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documenter = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentenceDetector = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx") + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel.pretrained("w2v_cc_300d","eu") + .setInputCols(Array("sentence","token")) + .setOutputCol("embeddings") + +val ner_model = MedicalNerModel.pretrained("ner_eu_clinical_case", "eu", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverterInternal() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + +val pipeline = new Pipeline().setStages(Array(documenter, sentenceDetector, tokenizer, word_embeddings, ner_model, ner_converter)) + +val data = Seq(Array("""3 urteko mutiko bat nahasmendu autistarekin unibertsitateko ospitaleko A pediatriako ospitalean. Ez du autismoaren espektroaren nahaste edo gaixotasun familiaren aurrekaririk. Mutilari komunikazio-nahaste larria diagnostikatu zioten, elkarrekintza sozialeko zailtasunak eta prozesamendu sentsorial atzeratua. Odol-analisiak normalak izan ziren (tiroidearen hormona estimulatzailea (TSH), hemoglobina, batez besteko bolumen corpuskularra (MCV) eta ferritina). Goiko endoskopiak mukosaren azpiko tumore bat ere erakutsi zuen, urdail-irteeren guztizko oztopoa eragiten zuena. Estroma gastrointestinalaren tumore baten susmoa ikusita, distaleko gastrektomia egin zen. Azterketa histopatologikoak agerian utzi zuen mukosaren azpiko zelulen ugaltzea.""")).toDS().toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash ++----------------------------+------------------+ +|chunk |ner_label | ++----------------------------+------------------+ +|3 urteko mutiko bat |patient | +|nahasmendu |clinical_event | +|autismoaren espektroaren |clinical_condition| +|nahaste |clinical_event | +|gaixotasun |clinical_event | +|familiaren |patient | +|aurrekaririk |clinical_event | +|Mutilari |patient | +|komunikazio-nahaste |clinical_event | +|diagnostikatu |clinical_event | +|elkarrekintza |clinical_event | +|zailtasunak |clinical_event | +|prozesamendu sentsorial |clinical_event | +|Odol-analisiak |clinical_event | +|normalak |units_measurements| +|tiroidearen |bodypart | +|hormona estimulatzailea |clinical_event | +|TSH |clinical_event | +|hemoglobina |clinical_event | +|bolumen |clinical_event | +|MCV |clinical_event | +|ferritina |clinical_event | +|Goiko |bodypart | +|endoskopiak |clinical_event | +|mukosaren azpiko |bodypart | +|tumore |clinical_event | +|erakutsi |clinical_event | +|oztopoa |clinical_event | +|Estroma gastrointestinalaren|clinical_event | +|tumore |clinical_event | +|ikusita |clinical_event | +|distaleko |bodypart | +|gastrektomia |clinical_event | +|Azterketa |clinical_event | +|agerian |clinical_event | +|utzi |clinical_event | +|mukosaren azpiko zelulen |bodypart | +|ugaltzea |clinical_event | ++----------------------------+------------------+ + + +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_eu_clinical_case| +|Compatibility:|Healthcare NLP 4.2.8+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|eu| +|Size:|896.1 KB| + +## References + +The corpus used for model training is provided by European Clinical Case Corpus (E3C), a project aimed at offering a freely available multilingual corpus of semantically annotated clinical narratives. 
+ +## Sample text from the training dataset + +3 urteko mutiko bat nahasmendu autistarekin unibertsitateko ospitaleko A pediatriako ospitalean. Ez du autismoaren espektroaren nahaste edo gaixotasun familiaren aurrekaririk. Mutilari komunikazio-nahaste larria diagnostikatu zioten, elkarrekintza sozialeko zailtasunak eta prozesamendu sentsorial atzeratua. Odol-analisiak normalak izan ziren (tiroidearen hormona estimulatzailea (TSH), hemoglobina, batez besteko bolumen corpuskularra (MCV) eta ferritina). Goiko endoskopiak mukosaren azpiko tumore bat ere erakutsi zuen, urdail-irteeren guztizko oztopoa eragiten zuena. Estroma gastrointestinalaren tumore baten susmoa ikusita, distaleko gastrektomia egin zen. Azterketa histopatologikoak agerian utzi zuen mukosaren azpiko zelulen ugaltzea. + +## Benchmarking + +```bash + label tp fp fn total precision recall f1 + date_time 103.0 13.0 26.0 129.0 0.8879 0.7984 0.8408 +units_measurements 257.0 37.0 9.0 266.0 0.8741 0.9662 0.9179 +clinical_condition 20.0 22.0 33.0 53.0 0.4782 0.3774 0.4211 + patient 69.0 3.0 8.0 77.0 0.9583 0.8961 0.9262 + clinical_event 712.0 121.0 95.0 807.0 0.8547 0.8823 0.8683 + bodypart 182.0 33.0 15.0 197.0 0.8465 0.9239 0.8835 + macro - - - - - - 0.8096 + micro - - - - - - 0.8640 +``` \ No newline at end of file diff --git a/docs/_posts/gpirge/2023-02-06-ner_eu_clinical_condition_en.md b/docs/_posts/gpirge/2023-02-06-ner_eu_clinical_condition_en.md new file mode 100644 index 00000000000000..e417d46c363974 --- /dev/null +++ b/docs/_posts/gpirge/2023-02-06-ner_eu_clinical_condition_en.md @@ -0,0 +1,153 @@ +--- +layout: model +title: Detect Clinical Conditions (ner_eu_clinical_condition) +author: John Snow Labs +name: ner_eu_clinical_condition +date: 2023-02-06 +tags: [en, clinical, licensed, ner] +task: Named Entity Recognition +language: en +edition: Healthcare NLP 4.2.8 +spark_version: 3.0 +supported: true +annotator: MedicalNerModel +article_header: + type: cover +use_language_switcher: 
"Python-Scala-Java" +--- + +## Description + +Pretrained named entity recognition (NER) deep learning model for clinical conditions. The SparkNLP deep learning model (MedicalNerModel) is inspired by a former state of the art model for NER: Chiu & Nichols, Named Entity Recognition with Bidirectional LSTM-CNN. + +The corpus used for model training is provided by European Clinical Case Corpus (E3C), a project aimed at offering a freely available multilingual corpus of semantically annotated clinical narratives. + +## Predicted Entities + +`clinical_condition` + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_eu_clinical_condition_en_4.2.8_3.0_1675718793293.zip){:.button.button-orange} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/clinical/models/ner_eu_clinical_condition_en_4.2.8_3.0_1675718793293.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentenceDetectorDL = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "en")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer()\ + .setInputCols(["sentence"])\ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical","en","clinical/models")\ + .setInputCols(["sentence","token"])\ + .setOutputCol("embeddings") + +ner = MedicalNerModel.pretrained('ner_eu_clinical_condition', "en", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +ner_converter = NerConverterInternal()\ + .setInputCols(["sentence", "token", "ner"])\ + .setOutputCol("ner_chunk") + +pipeline = Pipeline(stages=[ + document_assembler, + sentenceDetectorDL, + tokenizer, + word_embeddings, + ner, + ner_converter]) + +data = spark.createDataFrame([["""Hyperparathyroidism was considered upon the fourth occasion. The history of weakness and generalized joint pains were present. He also had history of epigastric pain diagnosed informally as gastritis. He had previously had open reduction and internal fixation for the initial two fractures under general anesthesia. 
He sustained mandibular fracture."""]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documenter = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentenceDetector = SentenceDetectorDLModel.pretrained() + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical","en","clinical/models") + .setInputCols(Array("sentence","token")) + .setOutputCol("embeddings") + +val ner_model = MedicalNerModel.pretrained("ner_eu_clinical_condition", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverterInternal() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + +val pipeline = new Pipeline().setStages(Array(documenter, sentenceDetector, tokenizer, word_embeddings, ner_model, ner_converter)) + +val data = Seq(Array("""Hyperparathyroidism was considered upon the fourth occasion. The history of weakness and generalized joint pains were present. He also had history of epigastric pain diagnosed informally as gastritis. He had previously had open reduction and internal fixation for the initial two fractures under general anesthesia. He sustained mandibular fracture.""")).toDS().toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash ++-----------------------+------------------+ +|chunk |ner_label | ++-----------------------+------------------+ +|Hyperparathyroidism |clinical_condition| +|weakness |clinical_condition| +|generalized joint pains|clinical_condition| +|epigastric pain |clinical_condition| +|gastritis |clinical_condition| +|fractures |clinical_condition| +|anesthesia |clinical_condition| +|mandibular fracture |clinical_condition| ++-----------------------+------------------+ +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_eu_clinical_condition| +|Compatibility:|Healthcare NLP 4.2.8+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|en| +|Size:|851.3 KB| + +## References + +The corpus used for model training is provided by European Clinical Case Corpus (E3C), a project aimed at offering a freely available multilingual corpus of semantically annotated clinical narratives. 
+ +## Benchmarking + +```bash + label tp fp fn total precision recall f1 +clinical_condition 230.0 28.0 70.0 300.0 0.8915 0.7667 0.8244 + macro - - - - - - 0.8244 + micro - - - - - - 0.8244 +``` \ No newline at end of file diff --git a/docs/_posts/gpirge/2023-02-06-ner_eu_clinical_condition_es.md b/docs/_posts/gpirge/2023-02-06-ner_eu_clinical_condition_es.md new file mode 100644 index 00000000000000..39305dec73d697 --- /dev/null +++ b/docs/_posts/gpirge/2023-02-06-ner_eu_clinical_condition_es.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Detect Clinical Conditions (ner_eu_clinical_condition - es) +author: John Snow Labs +name: ner_eu_clinical_condition +date: 2023-02-06 +tags: [es, clinical, licensed, ner, clinical_condition] +task: Named Entity Recognition +language: es +edition: Healthcare NLP 4.2.8 +spark_version: 3.0 +supported: true +annotator: MedicalNerModel +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained named entity recognition (NER) deep learning model for extracting clinical conditions from Spanish texts. The SparkNLP deep learning model (MedicalNerModel) is inspired by a former state of the art model for NER: Chiu & Nichols, Named Entity Recognition with Bidirectional LSTM-CNN. + +The corpus used for model training is provided by European Clinical Case Corpus (E3C), a project aimed at offering a freely available multilingual corpus of semantically annotated clinical narratives. + +## Predicted Entities + +`clinical_condition` + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_eu_clinical_condition_es_4.2.8_3.0_1675721390266.zip){:.button.button-orange} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/clinical/models/ner_eu_clinical_condition_es_4.2.8_3.0_1675721390266.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentenceDetectorDL = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer()\ + .setInputCols(["sentence"])\ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel.pretrained("w2v_cc_300d","es")\ + .setInputCols(["sentence","token"])\ + .setOutputCol("embeddings") + +ner = MedicalNerModel.pretrained('ner_eu_clinical_condition', "es", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +ner_converter = NerConverterInternal()\ + .setInputCols(["sentence", "token", "ner"])\ + .setOutputCol("ner_chunk") + +pipeline = Pipeline(stages=[ + document_assembler, + sentenceDetectorDL, + tokenizer, + word_embeddings, + ner, + ner_converter]) + +data = spark.createDataFrame([["""La exploración abdominal revela una cicatriz de laparotomía media infraumbilical, la presencia de ruidos disminuidos, y dolor a la palpación de manera difusa sin claros signos de irritación peritoneal. 
No existen hernias inguinales o crurales."""]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documenter = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentenceDetector = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx") + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel.pretrained("w2v_cc_300d","es") + .setInputCols(Array("sentence","token")) + .setOutputCol("embeddings") + +val ner_model = MedicalNerModel.pretrained("ner_eu_clinical_condition", "es", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverterInternal() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + +val pipeline = new Pipeline().setStages(Array(documenter, sentenceDetector, tokenizer, word_embeddings, ner_model, ner_converter)) + +val data = Seq(Array("""La exploración abdominal revela una cicatriz de laparotomía media infraumbilical, la presencia de ruidos disminuidos, y dolor a la palpación de manera difusa sin claros signos de irritación peritoneal. No existen hernias inguinales o crurales.""")).toDS().toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash ++--------------------+------------------+ +|chunk |ner_label | ++--------------------+------------------+ +|cicatriz |clinical_condition| +|dolor a la palpación|clinical_condition| +|signos |clinical_condition| +|irritación |clinical_condition| +|hernias inguinales |clinical_condition| +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_eu_clinical_condition| +|Compatibility:|Healthcare NLP 4.2.8+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|es| +|Size:|898.1 KB| + +## References + +The corpus used for model training is provided by European Clinical Case Corpus (E3C), a project aimed at offering a freely available multilingual corpus of semantically annotated clinical narratives. + +## Benchmarking + +```bash + label tp fp fn total precision recall f1 +clinical_condition 354.0 42.0 84.0 438.0 0.8939 0.8082 0.8489 + macro - - - - - - 0.8489 + micro - - - - - - 0.8489 +``` \ No newline at end of file diff --git a/docs/_posts/gpirge/2023-02-06-ner_eu_clinical_condition_eu.md b/docs/_posts/gpirge/2023-02-06-ner_eu_clinical_condition_eu.md new file mode 100644 index 00000000000000..6f25f06fe7d8d3 --- /dev/null +++ b/docs/_posts/gpirge/2023-02-06-ner_eu_clinical_condition_eu.md @@ -0,0 +1,153 @@ +--- +layout: model +title: Detect Clinical Conditions (ner_eu_clinical_condition - eu) +author: John Snow Labs +name: ner_eu_clinical_condition +date: 2023-02-06 +tags: [eu, clinical, licensed, ner, clinical_condition] +task: Named Entity Recognition +language: eu +edition: Healthcare NLP 4.2.8 +spark_version: 3.0 +supported: true +annotator: MedicalNerModel +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained named entity recognition (NER) deep learning model for extracting clinical conditions from Basque texts. 
The SparkNLP deep learning model (MedicalNerModel) is inspired by a former state of the art model for NER: Chiu & Nichols, Named Entity Recognition with Bidirectional LSTM-CNN. + +The corpus used for model training is provided by European Clinical Case Corpus (E3C), a project aimed at offering a freely available multilingual corpus of semantically annotated +clinical narratives. + +## Predicted Entities + +`clinical_condition` + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_eu_clinical_condition_eu_4.2.8_3.0_1675723038941.zip){:.button.button-orange} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/clinical/models/ner_eu_clinical_condition_eu_4.2.8_3.0_1675723038941.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentenceDetectorDL = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer()\ + .setInputCols(["sentence"])\ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel.pretrained("w2v_cc_300d","eu")\ + .setInputCols(["sentence","token"])\ + .setOutputCol("embeddings") + +ner = MedicalNerModel.pretrained('ner_eu_clinical_condition', "eu", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +ner_converter = NerConverterInternal()\ + .setInputCols(["sentence", "token", "ner"])\ + .setOutputCol("ner_chunk") + +pipeline = Pipeline(stages=[ + document_assembler, + sentenceDetectorDL, + tokenizer, + word_embeddings, + ner, + ner_converter]) + +data = spark.createDataFrame([["""Gertaera honetatik bi hilabetetara, umea Larrialdietako Zerbitzura dator 4 egunetan zehar buruko mina eta bekokiko hantura azaltzeagatik, sukarrik izan gabe. Miaketan, haztapen mingarria duen bekokiko hantura bigunaz gain, ez da beste zeinurik azaltzen. Polakiuria eta tenesmo arina ere izan zuen egun horretan hematuriarekin batera. 
Geroztik sintomarik gabe dago."""]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documenter = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentenceDetector = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx") + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel.pretrained("w2v_cc_300d","eu") + .setInputCols(Array("sentence","token")) + .setOutputCol("embeddings") + +val ner_model = MedicalNerModel.pretrained("ner_eu_clinical_condition", "eu", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverterInternal() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + +val pipeline = new Pipeline().setStages(Array(documenter, sentenceDetector, tokenizer, word_embeddings, ner_model, ner_converter)) + +val data = Seq(Array("""Gertaera honetatik bi hilabetetara, umea Larrialdietako Zerbitzura dator 4 egunetan zehar buruko mina eta bekokiko hantura azaltzeagatik, sukarrik izan gabe. Miaketan, haztapen mingarria duen bekokiko hantura bigunaz gain, ez da beste zeinurik azaltzen. Polakiuria eta tenesmo arina ere izan zuen egun horretan hematuriarekin batera. Geroztik sintomarik gabe dago.""")).toDS().toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash ++----------+------------------+ +|chunk |ner_label | ++----------+------------------+ +|mina |clinical_condition| +|hantura |clinical_condition| +|sukarrik |clinical_condition| +|mingarria |clinical_condition| +|hantura |clinical_condition| +|Polakiuria|clinical_condition| +|sintomarik|clinical_condition| ++----------+------------------+ +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_eu_clinical_condition| +|Compatibility:|Healthcare NLP 4.2.8+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|eu| +|Size:|899.6 KB| + +## References + +The corpus used for model training is provided by European Clinical Case Corpus (E3C), a project aimed at offering a freely available multilingual corpus of semantically annotated clinical narratives. + +## Benchmarking + +```bash + label tp fp fn total precision recall f1 +clinical_condition 45.0 4.0 13.0 58.0 0.9184 0.7759 0.8411 + macro - - - - - - 0.8411 + micro - - - - - - 0.8411 +``` \ No newline at end of file diff --git a/docs/_posts/gpirge/2023-02-06-ner_eu_clinical_condition_fr.md b/docs/_posts/gpirge/2023-02-06-ner_eu_clinical_condition_fr.md new file mode 100644 index 00000000000000..0b93dfb9a2968e --- /dev/null +++ b/docs/_posts/gpirge/2023-02-06-ner_eu_clinical_condition_fr.md @@ -0,0 +1,152 @@ +--- +layout: model +title: Detect Clinical Conditions (ner_eu_clinical_condition - fr) +author: John Snow Labs +name: ner_eu_clinical_condition +date: 2023-02-06 +tags: [fr, clinical, licensed, ner, clinical_condition] +task: Named Entity Recognition +language: fr +edition: Healthcare NLP 4.2.8 +spark_version: 3.0 +supported: true +annotator: MedicalNerModel +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained named entity recognition (NER) deep learning model for extracting clinical conditions from French texts. 
The SparkNLP deep learning model (MedicalNerModel) is inspired by a former state of the art model for NER: Chiu & Nichols, Named Entity Recognition with Bidirectional LSTM-CNN. + +The corpus used for model training is provided by European Clinical Case Corpus (E3C), a project aimed at offering a freely available multilingual corpus of semantically annotated clinical narratives. + +## Predicted Entities + +`clinical_condition` + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_eu_clinical_condition_fr_4.2.8_3.0_1675725809666.zip){:.button.button-orange} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/clinical/models/ner_eu_clinical_condition_fr_4.2.8_3.0_1675725809666.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentenceDetectorDL = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer()\ + .setInputCols(["sentence"])\ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel.pretrained("w2v_cc_300d","fr")\ + .setInputCols(["sentence","token"])\ + .setOutputCol("embeddings") + +ner = MedicalNerModel.pretrained('ner_eu_clinical_condition', "fr", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +ner_converter = NerConverterInternal()\ + .setInputCols(["sentence", "token", "ner"])\ + .setOutputCol("ner_chunk") + +pipeline = Pipeline(stages=[ + document_assembler, + sentenceDetectorDL, + tokenizer, + word_embeddings, + ner, + ner_converter]) + +data = spark.createDataFrame([["""Il aurait présenté il y’ a environ 30 ans des ulcérations génitales non traitées spontanément guéries. L’interrogatoire retrouvait une toux sèche depuis trois mois, des douleurs rétro-sternales constrictives, une dyspnée stade III de la NYHA et un contexte d’ apyrexie. 
Sur ce tableau s’ est greffé des œdèmes des membres inférieurs puis un tableau d’ anasarque d’ où son hospitalisation en cardiologie pour décompensation cardiaque globale."""]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documenter = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentenceDetector = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx") + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel.pretrained("w2v_cc_300d","fr") + .setInputCols(Array("sentence","token")) + .setOutputCol("embeddings") + +val ner_model = MedicalNerModel.pretrained("ner_eu_clinical_condition", "fr", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverterInternal() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + +val pipeline = new Pipeline().setStages(Array(documenter, sentenceDetector, tokenizer, word_embeddings, ner_model, ner_converter)) + +val data = Seq(Array("""Il aurait présenté il y’ a environ 30 ans des ulcérations génitales non traitées spontanément guéries. L’interrogatoire retrouvait une toux sèche depuis trois mois, des douleurs rétro-sternales constrictives, une dyspnée stade III de la NYHA et un contexte d’ apyrexie. Sur ce tableau s’ est greffé des œdèmes des membres inférieurs puis un tableau d’ anasarque d’ où son hospitalisation en cardiologie pour décompensation cardiaque globale.""")).toDS().toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash ++------------------------+------------------+ +|chunk |ner_label | ++------------------------+------------------+ +|ulcérations |clinical_condition| +|toux sèche |clinical_condition| +|douleurs |clinical_condition| +|dyspnée |clinical_condition| +|apyrexie |clinical_condition| +|anasarque |clinical_condition| +|décompensation cardiaque|clinical_condition| ++------------------------+------------------+ +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_eu_clinical_condition| +|Compatibility:|Healthcare NLP 4.2.8+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|fr| +|Size:|899.9 KB| + +## References + +The corpus used for model training is provided by European Clinical Case Corpus (E3C), a project aimed at offering a freely available multilingual corpus of semantically annotated clinical narratives. + +## Benchmarking + +```bash + label tp fp fn total precision recall f1 +clinical_condition 269.0 51.0 52.0 321.0 0.8406 0.8380 0.8393 + macro - - - - - - 0.8393 + micro - - - - - - 0.8393 +``` \ No newline at end of file diff --git a/docs/_posts/gpirge/2023-02-06-ner_eu_clinical_condition_it.md b/docs/_posts/gpirge/2023-02-06-ner_eu_clinical_condition_it.md new file mode 100644 index 00000000000000..109bdbffa230e6 --- /dev/null +++ b/docs/_posts/gpirge/2023-02-06-ner_eu_clinical_condition_it.md @@ -0,0 +1,155 @@ +--- +layout: model +title: Detect Clinical Conditions (ner_eu_clinical_condition - it) +author: John Snow Labs +name: ner_eu_clinical_condition +date: 2023-02-06 +tags: [it, clinical, licensed, ner, clinical_condition] +task: Named Entity Recognition +language: it +edition: Healthcare NLP 4.2.8 +spark_version: 3.0 +supported: true +annotator: MedicalNerModel +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained named entity recognition (NER) deep learning model 
for extracting clinical conditions from Italian texts. The SparkNLP deep learning model (MedicalNerModel) is inspired by a former state of the art model for NER: Chiu & Nichols, Named Entity Recognition with Bidirectional LSTM-CNN. + +The corpus used for model training is provided by European Clinical Case Corpus (E3C), a project aimed at offering a freely available multilingual corpus of semantically annotated clinical narratives. + +## Predicted Entities + +`clinical_condition` + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_eu_clinical_condition_it_4.2.8_3.0_1675726754516.zip){:.button.button-orange} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/clinical/models/ner_eu_clinical_condition_it_4.2.8_3.0_1675726754516.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentenceDetectorDL = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer()\ + .setInputCols(["sentence"])\ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel.pretrained("w2v_cc_300d","it")\ + .setInputCols(["sentence","token"])\ + .setOutputCol("embeddings") + +ner = MedicalNerModel.pretrained('ner_eu_clinical_condition', "it", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +ner_converter = NerConverterInternal()\ + .setInputCols(["sentence", "token", "ner"])\ + .setOutputCol("ner_chunk") + +pipeline = Pipeline(stages=[ + document_assembler, + sentenceDetectorDL, + tokenizer, + word_embeddings, + ner, + ner_converter]) + +data = spark.createDataFrame([["""Donna, 64 anni, ricovero per dolore epigastrico persistente, irradiato a barra e posteriormente, associato a dispesia e anoressia. Poche settimane dopo compaiono, però, iperemia, intenso edema vulvare ed una esione ulcerativa sul lato sinistro della parete rettale che la RM mostra essere una fistola transfinterica. 
Questi trattamenti determinano un miglioramento dell’ infiammazione ed una riduzione dell’ ulcera, ma i condilomi permangono inalterati."""]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documenter = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentenceDetector = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx") + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel.pretrained("w2v_cc_300d","it") + .setInputCols(Array("sentence","token")) + .setOutputCol("embeddings") + +val ner_model = MedicalNerModel.pretrained("ner_eu_clinical_condition", "it", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverterInternal() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + +val pipeline = new Pipeline().setStages(Array(documenter, sentenceDetector, tokenizer, word_embeddings, ner_model, ner_converter)) + +val data = Seq(Array("""Donna, 64 anni, ricovero per dolore epigastrico persistente, irradiato a barra e posteriormente, associato a dispesia e anoressia. Poche settimane dopo compaiono, però, iperemia, intenso edema vulvare ed una esione ulcerativa sul lato sinistro della parete rettale che la RM mostra essere una fistola transfinterica. Questi trattamenti determinano un miglioramento dell’ infiammazione ed una riduzione dell’ ulcera, ma i condilomi permangono inalterati.""")).toDS().toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash ++----------------------+------------------+ +|chunk |ner_label | ++----------------------+------------------+ +|dolore epigastrico |clinical_condition| +|anoressia |clinical_condition| +|iperemia |clinical_condition| +|edema |clinical_condition| +|fistola transfinterica|clinical_condition| +|infiammazione |clinical_condition| ++----------------------+------------------+ + + + + +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_eu_clinical_condition| +|Compatibility:|Healthcare NLP 4.2.8+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|it| +|Size:|903.5 KB| + +## References + +The corpus used for model training is provided by European Clinical Case Corpus (E3C), a project aimed at offering a freely available multilingual corpus of semantically annotated clinical narratives. + +## Benchmarking + +```bash + label tp fp fn total precision recall f1 +clinical_condition 208.0 35.0 46.0 254.0 0.8560 0.8189 0.8370 + macro - - - - - - 0.8370 + micro - - - - - - 0.8370 +``` \ No newline at end of file From b483fa462e41b02356c8edd5545c4fe127e2ed67 Mon Sep 17 00:00:00 2001 From: jsl-models <74001263+jsl-models@users.noreply.github.com> Date: Fri, 10 Feb 2023 15:45:18 +0700 Subject: [PATCH 54/57] Add model 2023-02-09-rxnorm_drug_brandname_mapper_en (#13493) --- ...3-02-09-rxnorm_drug_brandname_mapper_en.md | 148 ++++++++++++++++++ 1 file changed, 148 insertions(+) create mode 100644 docs/_posts/Ahmetemintek/2023-02-09-rxnorm_drug_brandname_mapper_en.md diff --git a/docs/_posts/Ahmetemintek/2023-02-09-rxnorm_drug_brandname_mapper_en.md b/docs/_posts/Ahmetemintek/2023-02-09-rxnorm_drug_brandname_mapper_en.md new file mode 100644 index 00000000000000..d12d1677eb0116 --- /dev/null +++ b/docs/_posts/Ahmetemintek/2023-02-09-rxnorm_drug_brandname_mapper_en.md @@ -0,0 +1,148 @@ +--- +layout: model +title: Mapping RxNorm and RxNorm Extension 
Codes with Corresponding Drug Brand Names +author: John Snow Labs +name: rxnorm_drug_brandname_mapper +date: 2023-02-09 +tags: [chunk_mapping, rxnorm, drug_brand_name, rxnorm_extension, en, clinical, licensed] +task: Chunk Mapping +language: en +edition: Healthcare NLP 4.3.0 +spark_version: 3.0 +supported: true +annotator: ChunkMapperModel +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This pretrained model maps RxNorm and RxNorm Extension codes with their corresponding drug brand names. It returns 2 types of brand names for the corresponding RxNorm or RxNorm Extension code. + +## Predicted Entities + +`rxnorm_brandname`, `rxnorm_extension_brandname` + +{:.btn-box} + +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/26.Chunk_Mapping.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/rxnorm_drug_brandname_mapper_en_4.3.0_3.0_1675966478332.zip){:.button.button-orange} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/clinical/models/rxnorm_drug_brandname_mapper_en_4.3.0_3.0_1675966478332.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("chunk") + +sbert_embedder = BertSentenceEmbeddings\ + .pretrained("sbiobert_base_cased_mli", "en","clinical/models")\ + .setInputCols(["chunk"])\ + .setOutputCol("sbert_embeddings") + +rxnorm_resolver = SentenceEntityResolverModel\ + .pretrained("sbiobertresolve_rxnorm_augmented", "en", "clinical/models")\ + .setInputCols(["chunk", "sbert_embeddings"])\ + .setOutputCol("rxnorm_code")\ + .setDistanceFunction("EUCLIDEAN") + +resolver2chunk = Resolution2Chunk()\ + .setInputCols(["rxnorm_code"]) \ + .setOutputCol("rxnorm_chunk")\ + +chunkerMapper = ChunkMapperModel.pretrained("rxnorm_drug_brandname_mapper", "en", "clinical/models")\ + .setInputCols(["rxnorm_chunk"])\ + .setOutputCol("mappings")\ + .setRels(["rxnorm_brandname", "rxnorm_extension_brandname"]) + + +pipeline = Pipeline( + stages = [ + documentAssembler, + sbert_embedder, + rxnorm_resolver, + resolver2chunk, + chunkerMapper + ]) + +model = pipeline.fit(spark.createDataFrame([['']]).toDF('text')) + +pipeline = LightPipeline(model) + +result = pipeline.fullAnnotate(['metformin', 'advil']) + +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("chunk") + +val sbert_embedder = BertSentenceEmbeddings + .pretrained("sbiobert_base_cased_mli", "en","clinical/models") + .setInputCols(Array("chunk")) + .setOutputCol("sbert_embeddings") + +val rxnorm_resolver = SentenceEntityResolverModel + .pretrained("sbiobertresolve_rxnorm_augmented", "en", "clinical/models") + .setInputCols(Array("chunk", "sbert_embeddings")) + .setOutputCol("rxnorm_code") + .setDistanceFunction("EUCLIDEAN") + +val resolver2chunk = new Resolution2Chunk() + .setInputCols(Array("rxnorm_code")) + .setOutputCol("rxnorm_chunk") + +val chunkerMapper = ChunkMapperModel.pretrained("rxnorm_drug_brandname_mapper", "en", "clinical/models") + 
.setInputCols(Array("rxnorm_chunk")) + .setOutputCol("mappings") + .setRels(Array("rxnorm_brandname", "rxnorm_extension_brandname")) + + + +val pipeline = new Pipeline().setStages(Array( +documentAssembler, +sbert_embedder, +rxnorm_resolver, +resolver2chunk, +chunkerMapper +)) + +val data = Seq("metformin", "advil").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) + +``` +
+ +## Results + +```bash ++--------------+-------------+--------------------------------------------------+--------------------------+ +| drug_name|rxnorm_result| mapping_result| relation | ++--------------+-------------+--------------------------------------------------+--------------------------+ +| metformin| 6809|Actoplus Met (metformin):::Avandamet (metformin...| rxnorm_brandname| +| metformin| 6809|A FORMIN (metformin):::ABERIN MAX (metformin)::...|rxnorm_extension_brandname| +| advil| 153010| Advil (Advil)| rxnorm_brandname| +| advil| 153010| NONE|rxnorm_extension_brandname| ++--------------+-------------+--------------------------------------------------+--------------------------+ +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|rxnorm_drug_brandname_mapper| +|Compatibility:|Healthcare NLP 4.3.0+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[rxnorm_chunk]| +|Output Labels:|[mappings]| +|Language:|en| +|Size:|4.0 MB| \ No newline at end of file From b4bdec67bfaac597787dbf7b676554a7e6938a93 Mon Sep 17 00:00:00 2001 From: jsl-models <74001263+jsl-models@users.noreply.github.com> Date: Fri, 10 Feb 2023 15:46:00 +0700 Subject: [PATCH 55/57] 2023-02-10-ner_sdoh_social_environment_wip_en (#13496) --- ...2023-02-10-ner_sdoh_demographics_wip_en.md | 166 ++++++++++++++++++ ...10-ner_sdoh_income_social_status_wip_en.md | 157 +++++++++++++++++ ...2-10-ner_sdoh_social_environment_wip_en.md | 161 +++++++++++++++++ 3 files changed, 484 insertions(+) create mode 100644 docs/_posts/Meryem1425/2023-02-10-ner_sdoh_demographics_wip_en.md create mode 100644 docs/_posts/Meryem1425/2023-02-10-ner_sdoh_income_social_status_wip_en.md create mode 100644 docs/_posts/Meryem1425/2023-02-10-ner_sdoh_social_environment_wip_en.md diff --git a/docs/_posts/Meryem1425/2023-02-10-ner_sdoh_demographics_wip_en.md b/docs/_posts/Meryem1425/2023-02-10-ner_sdoh_demographics_wip_en.md new file mode 100644 index 00000000000000..0400a7a9b1d975 
--- /dev/null +++ b/docs/_posts/Meryem1425/2023-02-10-ner_sdoh_demographics_wip_en.md @@ -0,0 +1,166 @@ +--- +layout: model +title: Extract Demographic Entities from Social Determinants of Health Texts +author: John Snow Labs +name: ner_sdoh_demographics_wip +date: 2023-02-10 +tags: [licensed, clinical, social_determinants, en, ner, demographics, sdoh, public_health] +task: Named Entity Recognition +language: en +edition: Healthcare NLP 4.2.8 +spark_version: 3.0 +supported: true +annotator: MedicalNerModel +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model extracts demographic information related to Social Determinants of Health from various kinds of biomedical documents. + +## Predicted Entities + +`Family_Member`, `Age`, `Gender`, `Geographic_Entity`, `Race_Ethnicity`, `Language`, `Spiritual_Beliefs` + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_sdoh_demographics_wip_en_4.2.8_3.0_1675998706136.zip){:.button.button-orange} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/clinical/models/ner_sdoh_demographics_wip_en_4.2.8_3.0_1675998706136.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "en")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer()\ + .setInputCols(["sentence"])\ + .setOutputCol("token") + +clinical_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token"])\ + .setOutputCol("embeddings") + +ner_model = MedicalNerModel.pretrained("ner_sdoh_demographics_wip", "en", "clinical/models")\ + .setInputCols(["sentence", "token", "embeddings"])\ + .setOutputCol("ner") + +ner_converter = NerConverterInternal()\ + .setInputCols(["sentence", "token", "ner"])\ + .setOutputCol("ner_chunk") + +pipeline = Pipeline(stages=[ + document_assembler, + sentence_detector, + tokenizer, + clinical_embeddings, + ner_model, + ner_converter + ]) + +sample_texts = ["SOCIAL HISTORY: He is a former tailor from Korea.", + "He lives alone,single and no children.", + "Pt is a 61 years old married, Caucasian, Catholic woman. 
Pt speaks English reasonably well."] + + +data = spark.createDataFrame(sample_texts, StringType()).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "en") + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val clinical_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val ner_model = MedicalNerModel.pretrained("ner_sdoh_demographics_wip", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverterInternal() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + +val pipeline = new Pipeline().setStages(Array( + document_assembler, + sentence_detector, + tokenizer, + clinical_embeddings, + ner_model, + ner_converter +)) + +val data = Seq("Pt is a 61 years old married, Caucasian, Catholic woman. Pt speaks English reasonably well.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash ++-----------------+-----+---+------------+ +|ner_label |begin|end|chunk | ++-----------------+-----+---+------------+ +|Gender |16 |17 |He | +|Geographic_Entity|43 |47 |Korea | +|Gender |0 |1 |He | +|Family_Member |29 |36 |children | +|Age |8 |19 |61 years old| +|Race_Ethnicity |30 |38 |Caucasian | +|Spiritual_Beliefs|41 |48 |Catholic | +|Gender |50 |54 |woman | +|Language |67 |73 |English | ++-----------------+-----+---+------------+ +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_sdoh_demographics_wip| +|Compatibility:|Healthcare NLP 4.2.8+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|en| +|Size:|858.4 KB| + +## Benchmarking + +```bash + label tp fp fn total precision recall f1 + Age 1346.0 73.0 74.0 1420.0 0.948555 0.947887 0.948221 +Spiritual_Beliefs 100.0 13.0 16.0 116.0 0.884956 0.862069 0.873362 + Family_Member 4468.0 134.0 43.0 4511.0 0.970882 0.990468 0.980577 + Race_Ethnicity 56.0 0.0 13.0 69.0 1.000000 0.811594 0.896000 + Gender 9825.0 67.0 247.0 10072.0 0.993227 0.975477 0.984272 +Geographic_Entity 225.0 9.0 29.0 254.0 0.961538 0.885827 0.922131 + Language 51.0 9.0 5.0 56.0 0.850000 0.910714 0.879310 +``` diff --git a/docs/_posts/Meryem1425/2023-02-10-ner_sdoh_income_social_status_wip_en.md b/docs/_posts/Meryem1425/2023-02-10-ner_sdoh_income_social_status_wip_en.md new file mode 100644 index 00000000000000..86f5568e5f4ee6 --- /dev/null +++ b/docs/_posts/Meryem1425/2023-02-10-ner_sdoh_income_social_status_wip_en.md @@ -0,0 +1,157 @@ +--- +layout: model +title: Extract Income and Social Status Entities from Social Determinants of Health Texts +author: John Snow Labs +name: ner_sdoh_income_social_status_wip +date: 2023-02-10 +tags: [licensed, clinical, social_determinants, en, ner, income, social_status, sdoh, public_health] +task: Named Entity Recognition +language: en +edition: Healthcare NLP 4.2.8 
+spark_version: 3.0 +supported: true +annotator: MedicalNerModel +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model extracts income and social status information related to Social Determinants of Health from various kinds of biomedical documents. + +## Predicted Entities + +`Education`, `Marital_Status`, `Financial_Status`, `Population_Group`, `Employment` + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_sdoh_income_social_status_wip_en_4.2.8_3.0_1675999206708.zip){:.button.button-orange} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/clinical/models/ner_sdoh_income_social_status_wip_en_4.2.8_3.0_1675999206708.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "en")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer()\ + .setInputCols(["sentence"])\ + .setOutputCol("token") + +clinical_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token"])\ + .setOutputCol("embeddings") + +ner_model = MedicalNerModel.pretrained("ner_sdoh_income_social_status_wip", "en", "clinical/models")\ + .setInputCols(["sentence", "token", "embeddings"])\ + .setOutputCol("ner") + +ner_converter = NerConverterInternal()\ + .setInputCols(["sentence", "token", "ner"])\ + .setOutputCol("ner_chunk") + +pipeline = Pipeline(stages=[ + document_assembler, + sentence_detector, + tokenizer, + clinical_embeddings, + ner_model, + ner_converter + ]) + +sample_texts = ["Pt is described as divorced and pleasant when approached but keeps to himself. Pt is working as a plumber, but he gets financial diffuculties. He has a son student at college. 
His family is imigrant for 2 years."] + +data = spark.createDataFrame(sample_texts, StringType()).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "en") + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val clinical_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val ner_model = MedicalNerModel.pretrained("ner_sdoh_income_social_status_wip", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverterInternal() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + +val pipeline = new Pipeline().setStages(Array( + document_assembler, + sentence_detector, + tokenizer, + clinical_embeddings, + ner_model, + ner_converter +)) + +val data = Seq("Pt is described as divorced and pleasant when approached but keeps to himself. Pt is working as a plumber, but he gets financial diffuculties. He has a son student at college. His family is imigrant for 2 years.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +## Results + +```bash ++-----------+----------------+-----+---+----------------------+ +|sentence_id|ner_label |begin|end|chunk | ++-----------+----------------+-----+---+----------------------+ +|0 |Marital_Status |19 |26 |divorced | +|1 |Employment |98 |104|plumber | +|1 |Financial_Status|119 |140|financial diffuculties| +|2 |Education |156 |162|student | +|2 |Education |167 |173|college | +|3 |Population_Group|190 |197|imigrant | ++-----------+----------------+-----+---+----------------------+ +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_sdoh_income_social_status_wip| +|Compatibility:|Healthcare NLP 4.2.8+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|en| +|Size:|856.8 KB| + +## Benchmarking + +```bash + label tp fp fn total precision recall f1 + Education 95.0 20.0 18.0 113.0 0.826087 0.840708 0.833333 +Population_Group 41.0 0.0 5.0 46.0 1.000000 0.891304 0.942529 +Financial_Status 286.0 52.0 82.0 368.0 0.846154 0.777174 0.810198 + Employment 3968.0 142.0 215.0 4183.0 0.965450 0.948601 0.956952 + Marital_Status 167.0 1.0 7.0 174.0 0.994048 0.959770 0.976608 +``` \ No newline at end of file diff --git a/docs/_posts/Meryem1425/2023-02-10-ner_sdoh_social_environment_wip_en.md b/docs/_posts/Meryem1425/2023-02-10-ner_sdoh_social_environment_wip_en.md new file mode 100644 index 00000000000000..046ce924a564b0 --- /dev/null +++ b/docs/_posts/Meryem1425/2023-02-10-ner_sdoh_social_environment_wip_en.md @@ -0,0 +1,161 @@ +--- +layout: model +title: Detect SDOH of Social Environment +author: John Snow Labs +name: ner_sdoh_social_environment_wip +date: 2023-02-10 +tags: [licensed, clinical, social_determinants, en, ner, social, environment, sdoh, public_health] +task: Named Entity Recognition +language: en +edition: Healthcare NLP 4.2.8 +spark_version: 3.0 +supported: true +annotator: MedicalNerModel +article_header: + type: cover 
+use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model extracts social environment terminologies related to Social Determinants of Health from various kinds of documents. + +## Predicted Entities + +`Social_Support`, `Chidhood_Event`, `Social_Exclusion`, `Violence_Abuse_Legal` + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_sdoh_social_environment_wip_en_4.2.8_3.0_1675998295035.zip){:.button.button-orange} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/clinical/models/ner_sdoh_social_environment_wip_en_4.2.8_3.0_1675998295035.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "en")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer()\ + .setInputCols(["sentence"])\ + .setOutputCol("token") + +clinical_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token"])\ + .setOutputCol("embeddings") + +ner_model = MedicalNerModel.pretrained("ner_sdoh_social_environment_wip", "en", "clinical/models")\ + .setInputCols(["sentence", "token", "embeddings"])\ + .setOutputCol("ner") + +ner_converter = NerConverterInternal()\ + .setInputCols(["sentence", "token", "ner"])\ + .setOutputCol("ner_chunk") + +pipeline = Pipeline(stages=[ + document_assembler, + sentence_detector, + tokenizer, + clinical_embeddings, + ner_model, + ner_converter + ]) + +sample_texts = ["He is the primary caregiver.", + "There is some evidence of abuse.", + "She stated that she was in a safe environment in prison, but that her siblings lived in an unsafe neighborhood, she was very afraid for them and witnessed their ostracism by other people.", + "Medical history: Jane was born in a low - income household and experienced significant trauma during her childhood, including physical and emotional abuse."] + +data = spark.createDataFrame(sample_texts, StringType()).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "en") + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val clinical_embeddings = 
WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val ner_model = MedicalNerModel.pretrained("ner_sdoh_social_environment_wip", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverterInternal() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + +val pipeline = new Pipeline().setStages(Array( + document_assembler, + sentence_detector, + tokenizer, + clinical_embeddings, + ner_model, + ner_converter +)) + +val data = Seq("Medical history: Jane was born in a low - income household and experienced significant trauma during her childhood, including physical and emotional abuse.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) + +``` +
+ +## Results + +```bash ++--------------------+-----+---+---------------------------+ +|ner_label |begin|end|chunk | ++--------------------+-----+---+---------------------------+ +|Social_Support |10 |26 |primary caregiver | +|Violence_Abuse_Legal|26 |30 |abuse | +|Violence_Abuse_Legal|49 |54 |prison | +|Social_Exclusion |161 |169|ostracism | +|Chidhood_Event |87 |113|trauma during her childhood| +|Violence_Abuse_Legal|139 |153|emotional abuse | ++--------------------+-----+---+---------------------------+ +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_sdoh_social_environment_wip| +|Compatibility:|Healthcare NLP 4.2.8+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|en| +|Size:|858.7 KB| + +## Benchmarking + +```bash + label tp fp fn total precision recall f1 + Chidhood_Event 34.0 6.0 5.0 39.0 0.850000 0.871795 0.860759 + Social_Exclusion 45.0 6.0 12.0 57.0 0.882353 0.789474 0.833333 + Social_Support 1139.0 57.0 103.0 1242.0 0.952341 0.917069 0.934372 +Violence_Abuse_Legal 235.0 38.0 44.0 279.0 0.860806 0.842294 0.851449 +``` From 02ccbb8b53c2790fb23b4f084240ad78b2755917 Mon Sep 17 00:00:00 2001 From: Cabir C <64752006+Cabir40@users.noreply.github.com> Date: Fri, 10 Feb 2023 17:07:18 +0300 Subject: [PATCH 56/57] Update 2022-11-24-ner_oncology_anatomy_general_en.md --- .../2022-11-24-ner_oncology_anatomy_general_en.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_anatomy_general_en.md b/docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_anatomy_general_en.md index 193261541db729..0802eabcb27b80 100644 --- a/docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_anatomy_general_en.md +++ b/docs/_posts/Ahmetemintek/2022-11-24-ner_oncology_anatomy_general_en.md @@ -25,14 +25,11 @@ Definitions of Predicted Entities: - `Anatomical_Site`: Relevant anatomical terms mentioned in text. 
- `Direction`: Directional and laterality terms, such as "left", "right", "bilateral", "upper" and "lower". - ## Predicted Entities `Anatomical_Site`, `Direction` -{:.btn-box} - {:.btn-box} [Live Demo](https://demo.johnsnowlabs.com/healthcare/NER_ONCOLOGY_CLINICAL/){:.button.button-orange} [Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/27.Oncology_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} @@ -155,4 +152,4 @@ Anatomical_Site 2946 549 638 3584 0.84 0.82 0.83 Direction 864 209 120 984 0.81 0.88 0.84 macro_avg 3810 758 758 4568 0.82 0.85 0.84 micro_avg 3810 758 758 4568 0.83 0.83 0.83 -``` \ No newline at end of file +``` From a6e73716d5f6f2a2e666b40dd1cade537e35cd43 Mon Sep 17 00:00:00 2001 From: Cabir C <64752006+Cabir40@users.noreply.github.com> Date: Fri, 10 Feb 2023 19:29:44 +0300 Subject: [PATCH 57/57] Update 2023-01-06-redl_clinical_biobert_en.md --- docs/_posts/Cabir40/2023-01-06-redl_clinical_biobert_en.md | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/docs/_posts/Cabir40/2023-01-06-redl_clinical_biobert_en.md b/docs/_posts/Cabir40/2023-01-06-redl_clinical_biobert_en.md index 58e6df5d17d12f..e60a538ef591b5 100644 --- a/docs/_posts/Cabir40/2023-01-06-redl_clinical_biobert_en.md +++ b/docs/_posts/Cabir40/2023-01-06-redl_clinical_biobert_en.md @@ -27,12 +27,8 @@ Extract relations like `TrIP` : a certain treatment has improved a medical probl {:.btn-box} [Live Demo](https://demo.johnsnowlabs.com/healthcare/RE_CLINICAL/){:.button.button-orange} [Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/10.Clinical_Relation_Extraction.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} -<<<<<<< HEAD 
-[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/redl_clinical_biobert_en_4.2.4_3.0_1673020165617.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -======= [Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/redl_clinical_biobert_en_4.2.4_3.0_1673020165617.zip){:.button.button-orange.button-orange-trans.arr.button-icon.hidden} [Copy S3 URI](s3://auxdata.johnsnowlabs.com/clinical/models/redl_clinical_biobert_en_4.2.4_3.0_1673020165617.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ->>>>>>> master ## How to use @@ -225,4 +221,4 @@ TrIP 0.517 0.796 0.627 151 TrNAP 0.402 0.672 0.503 112 TrWP 0.257 0.824 0.392 109 Avg. 0.635 0.803 0.691 - -``` \ No newline at end of file +```