Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updated fin/leg ChunkMappers model card #13482

Merged
merged 1 commit into from
Feb 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,8 @@ ner_converter = nlp.NerConverter()\

CM = finance.ChunkMapperModel.pretrained('finmapper_nasdaq_ticker_stock_screener', 'en', 'finance/models')\
.setInputCols(["ner_chunk"])\
.setOutputCol("mappings")
.setOutputCol("mappings")\
.setEnableFuzzyMatching(True)

pipeline = nlp.Pipeline().setStages([document_assembler,
tokenizer,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,56 +41,37 @@ It can be optionally combined with Entity Resolution to normalize first the name
```python

document_assembler = nlp.DocumentAssembler()\
.setInputCol('text')\
.setOutputCol('document')
.setInputCol('text')\
.setOutputCol('document')

tokenizer = nlp.Tokenizer()\
.setInputCols("document")\
.setOutputCol("token")
.setInputCols("document")\
.setOutputCol("token")

embeddings = nlp.BertEmbeddings.pretrained("bert_embeddings_sec_bert_base","en") \
.setInputCols(["document", "token"]) \
.setOutputCol("embeddings")
.setInputCols(["document", "token"])\
.setOutputCol("embeddings")

ner_model = finance.NerModel.pretrained('finner_orgs_prods_alias', 'en', 'finance/models')\
.setInputCols(["document", "token", "embeddings"])\
.setOutputCol("ner")
.setInputCols(["document", "token", "embeddings"])\
.setOutputCol("ner")

ner_converter = nlp.NerConverter()\
.setInputCols(["document", "token", "ner"])\
.setOutputCol("ner_chunk")

# Optional: To normalize the ORG name using NASDAQ data before the mapping
##########################################################################
chunkToDoc = nlp.Chunk2Doc()\
.setInputCols("ner_chunk")\
.setOutputCol("ner_chunk_doc")

chunk_embeddings = nlp.UniversalSentenceEncoder.pretrained("tfhub_use_lg", "en")\
.setInputCols(["ner_chunk_doc"])\
.setOutputCol("chunk_embeddings")

use_er_model = finance.SentenceEntityResolverModel.pretrained('finel_nasdaq_data_company_name', 'en', 'finance/models')\
.setInputCols("chunk_embeddings")\
.setOutputCol('normalized')\
.setDistanceFunction("EUCLIDEAN")
##########################################################################

CM = finance.ChunkMapperModel()\
.pretrained('finmapper_nasdaq_companyname', 'en', 'finance/models')\
.setInputCols(["normalized"])\ #or ner_chunk without normalization
.setOutputCol("mappings")
.setInputCols(["document", "token", "ner"])\
.setOutputCol("ner_chunk")

CM = finance.ChunkMapperModel().pretrained('finmapper_nasdaq_companyname', 'en', 'finance/models')\
.setInputCols(["ner_chunk"])\
.setOutputCol("mappings")\
.setEnableFuzzyMatching(True)

pipeline = nlp.Pipeline().setStages([document_assembler,
tokenizer,
embeddings,
ner_model,
ner_converter,
chunkToDoc, # Optional for normalization
chunk_embeddings, # Optional for normalization
use_er_model, # Optional for normalization
CM])

tokenizer,
embeddings,
ner_model,
ner_converter,
CM])

text = """Altaba Inc. is a company which ..."""

test_data = spark.createDataFrame([[text]]).toDF("text")
Expand All @@ -107,7 +88,13 @@ lp.fullAnnotate(text)
## Results

```bash
[Row(mappings=[Row(annotatorType='labeled_dependency', begin=0, end=10, result='AABA', metadata={'sentence': '0', 'chunk': '0', 'entity': 'Altaba Inc.', 'relation': 'ticker', 'all_relations': ''}, embeddings=[]), Row(annotatorType='labeled_dependency', begin=0, end=10, result='Altaba Inc.', metadata={'sentence': '0', 'chunk': '0', 'entity': 'Altaba Inc.', 'relation': 'company_name', 'all_relations': ''}, embeddings=[]), Row(annotatorType='labeled_dependency', begin=0, end=10, result='Altaba', metadata={'sentence': '0', 'chunk': '0', 'entity': 'Altaba Inc.', 'relation': 'short_name', 'all_relations': ''}, embeddings=[]), Row(annotatorType='labeled_dependency', begin=0, end=10, result='Asset Management', metadata={'sentence': '0', 'chunk': '0', 'entity': 'Altaba Inc.', 'relation': 'industry', 'all_relations': ''}, embeddings=[]), Row(annotatorType='labeled_dependency', begin=0, end=10, result='Financial Services', metadata={'sentence': '0', 'chunk': '0', 'entity': 'Altaba Inc.', 'relation': 'sector', 'all_relations': ''}, embeddings=[])])]
{
"ticker": "AABA",
"company_name": "Altaba Inc.",
"short_name": "Altaba",
"industry": "Asset Management",
"sector": "Financial Services"
}
```

{:.model-param}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,31 +46,36 @@ tokenizer = nlp.Tokenizer()\
.setOutputCol("token")

tokenClassifier = nlp.RoBertaForTokenClassification.pretrained("finner_roberta_ticker", "en", "finance/models")\
.setInputCols(["document",'token'])\
.setOutputCol("ner")
.setInputCols(["document",'token'])\
.setOutputCol("ner")

ner_converter = nlp.NerConverter()\
.setInputCols(["document", "token", "ner"])\
.setOutputCol("ner_chunk")

CM = finance.ChunkMapperModel()\
.pretrained('finmapper_nasdaq_companyname', 'en', 'finance/models')\
CM = finance.ChunkMapperModel().pretrained('finmapper_nasdaq_ticker', 'en', 'finance/models')\
.setInputCols(["ner_chunk"])\
.setOutputCol("mappings")\
.setRel('company_name')

pipeline = Pipeline().setStages([document_assembler,
tokenizer,
tokenClassifier,
ner_converter,
CM])
.setRel('company_name')\
.setEnableFuzzyMatching(True)

pipeline = nlp.Pipeline().setStages(
[
document_assembler,
tokenizer,
tokenClassifier,
ner_converter,
CM
]
)

text = ["""There are some serious purchases and sales of AMZN stock today."""]

test_data = spark.createDataFrame([text]).toDF("text")

model = pipeline.fit(test_data)
res= model.transform(test_data)

res = model.transform(test_data)

res.select('mappings').collect()
```
Expand All @@ -80,7 +85,13 @@ res.select('mappings').collect()
## Results

```bash
[Row(mappings=[Row(annotatorType='labeled_dependency', begin=46, end=49, result='AMZN', metadata={'sentence': '0', 'chunk': '0', 'entity': 'AMZN', 'relation': 'ticker', 'all_relations': ''}, embeddings=[]), Row(annotatorType='labeled_dependency', begin=46, end=49, result='Amazon.com Inc.', metadata={'sentence': '0', 'chunk': '0', 'entity': 'AMZN', 'relation': 'company_name', 'all_relations': ''}, embeddings=[]), Row(annotatorType='labeled_dependency', begin=46, end=49, result='Amazon.com', metadata={'sentence': '0', 'chunk': '0', 'entity': 'AMZN', 'relation': 'short_name', 'all_relations': ''}, embeddings=[]), Row(annotatorType='labeled_dependency', begin=46, end=49, result='Retail - Apparel & Specialty', metadata={'sentence': '0', 'chunk': '0', 'entity': 'AMZN', 'relation': 'industry', 'all_relations': ''}, embeddings=[]), Row(annotatorType='labeled_dependency', begin=46, end=49, result='Consumer Cyclical', metadata={'sentence': '0', 'chunk': '0', 'entity': 'AMZN', 'relation': 'sector', 'all_relations': ''}, embeddings=[]), Row(annotatorType='labeled_dependency', begin=57, end=61, result='NONE', metadata={'sentence': '0', 'chunk': '1', 'entity': 'today'}, embeddings=[])])]
{
"ticker": "AMZN",
"company_name": "Amazon.com Inc.",
"short_name": "Amazon.com",
"industry": "Retail - Apparel & Specialty",
"sector": "Consumer Cyclical"
}
```

{:.model-param}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,77 +45,74 @@ IMPORTANT: Chunk Mappers work with exact matches, so before using Chunk Mapping,
{% include programmingLanguageSelectScalaPythonNLU.html %}

```python
documentAssembler = nlp.DocumentAssembler()\
.setInputCol("text")\
.setOutputCol("document")
document_assembler = nlp.DocumentAssembler()\
.setInputCol("text")\
.setOutputCol("document")

tokenizer = nlp.Tokenizer()\
.setInputCols(["document"])\
.setOutputCol("token")
.setInputCols(["document"])\
.setOutputCol("token")

embeddings = nlp.BertEmbeddings.pretrained("bert_embeddings_sec_bert_base","en") \
.setInputCols(["document", "token"]) \
.setOutputCol("embeddings")
.setInputCols(["document", "token"]) \
.setOutputCol("embeddings")

ner_model = finance.NerModel.pretrained("finner_orgs_prods_alias", "en", "finance/models")\
.setInputCols(["document", "token", "embeddings"])\
.setOutputCol("ner")
.setInputCols(["document", "token", "embeddings"])\
.setOutputCol("ner")

ner_converter = nlp.NerConverter()\
.setInputCols(["document","token","ner"])\
.setOutputCol("ner_chunk")

# Optional: To normalize the ORG name using NASDAQ data before the mapping
##########################################################################
chunkToDoc = nlp.Chunk2Doc()\
.setInputCols("ner_chunk")\
.setOutputCol("ner_chunk_doc")

chunk_embeddings = nlp.UniversalSentenceEncoder.pretrained("tfhub_use", "en") \
.setInputCols("ner_chunk_doc") \
.setOutputCol("sentence_embeddings")

use_er_model = finance.SentenceEntityResolverModel.pretrained("finel_edgar_company_name", "en", "finance/models") \
.setInputCols(["ner_chunk_doc", "sentence_embeddings"]) \
.setOutputCol("normalized")\
.setDistanceFunction("EUCLIDEAN")
##########################################################################

cm = finance.ChunkMapperModel()\
.pretrained("finmapper_edgar_companyname", "en", "finance/models")\
.setInputCols(["normalized"])\ # or ner_chunk for non normalized versions
.setOutputCol("mappings")
.setInputCols(["document","token","ner"])\
.setOutputCol("ner_chunk")

cm = finance.ChunkMapperModel().pretrained("finmapper_edgar_companyname", "en", "finance/models")\
.setInputCols(["ner_chunk"])\
.setOutputCol("mappings")\
.setEnableFuzzyMatching(True)

nlpPipeline = nlp.Pipeline(stages=[
documentAssembler,
tokenizer,
embeddings,
ner_model,
ner_converter,
chunkToDoc,
chunk_embeddings,
use_er_model,
cm
document_assembler,
tokenizer,
embeddings,
ner_model,
ner_converter,
cm
])

text = """NIKE Inc is an American multinational corporation that is engaged in the design, development, manufacturing, and worldwide marketing and sales of footwear,
apparel, equipment, accessories, and services"""
text = """NIKE Inc is an American multinational corporation that is engaged in the design, development, manufacturing, and worldwide marketing and sales of footwear, apparel, equipment, accessories, and services"""

test_data = spark.createDataFrame([[text]]).toDF("text")

model = nlpPipeline.fit(test_data)

lp = nlp.LightPipeline(model)

lp.annotate(text)
result = lp.fullAnnotate(text)
```

</div>

## Results

```bash
{"mappings": [["labeled_dependency", 0, 22, "Jamestown Invest 1, LLC", {"sentence": "0", "chunk": "0", "entity": "Jamestown Invest 1, LLC", "relation": "name", "all_relations": ""}], ["labeled_dependency", 0, 22, "REAL ESTATE INVESTMENT TRUSTS [6798]", {"sentence": "0", "chunk": "0", "entity": "Jamestown Invest 1, LLC", "relation": "sic", "all_relations": ""}], ["labeled_dependency", 0, 22, "6798", {"sentence": "0", "chunk": "0", "entity": "Jamestown Invest 1, LLC", "relation": "sic_code", "all_relations": ""}], ["labeled_dependency", 0, 22, "831529368", {"sentence": "0", "chunk": "0", "entity": "Jamestown Invest 1, LLC", "relation": "irs_number", "all_relations": ""}], ["labeled_dependency", 0, 22, "1231", {"sentence": "0", "chunk": "0", "entity": "Jamestown Invest 1, LLC", "relation": "fiscal_year_end", "all_relations": ""}], ["labeled_dependency", 0, 22, "GA", {"sentence": "0", "chunk": "0", "entity": "Jamestown Invest 1, LLC", "relation": "state_location", "all_relations": ""}], ["labeled_dependency", 0, 22, "DE", {"sentence": "0", "chunk": "0", "entity": "Jamestown Invest 1, LLC", "relation": "state_incorporation", "all_relations": ""}], ["labeled_dependency", 0, 22, "PONCE CITY MARKET", {"sentence": "0", "chunk": "0", "entity": "Jamestown Invest 1, LLC", "relation": "business_street", "all_relations": ""}], ["labeled_dependency", 0, 22, "ATLANTA", {"sentence": "0", "chunk": "0", "entity": "Jamestown Invest 1, LLC", "relation": "business_city", "all_relations": ""}], ["labeled_dependency", 0, 22, "GA", {"sentence": "0", "chunk": "0", "entity": "Jamestown Invest 1, LLC", "relation": "business_state", "all_relations": ""}], ["labeled_dependency", 0, 22, "30308", {"sentence": "0", "chunk": "0", "entity": "Jamestown Invest 1, LLC", "relation": "business_zip", "all_relations": ""}], ["labeled_dependency", 0, 22, "7708051000", {"sentence": "0", "chunk": "0", "entity": "Jamestown Invest 1, LLC", "relation": "business_phone", "all_relations": ""}], 
["labeled_dependency", 0, 22, "Jamestown Atlanta Invest 1, LLC", {"sentence": "0", "chunk": "0", "entity": "Jamestown Invest 1, LLC", "relation": "former_name", "all_relations": ""}], ["labeled_dependency", 0, 22, "20180824", {"sentence": "0", "chunk": "0", "entity": "Jamestown Invest 1, LLC", "relation": "former_name_date", "all_relations": ""}], ["labeled_dependency", 0, 22, "2019-11-21", {"sentence": "0", "chunk": "0", "entity": "Jamestown Invest 1, LLC", "relation": "date", "all_relations": "2019-10-24:::2019-11-25:::2019-11-12:::2022-01-13:::2022-03-31:::2022-04-11:::2022-07-12:::2022-06-30:::2021-01-14:::2021-04-06:::2021-03-31:::2021-04-28:::2021-06-30:::2021-09-10:::2021-09-22:::2021-09-30:::2021-10-08:::2020-03-16:::2021-12-30:::2020-04-06:::2020-04-29:::2020-06-12:::2020-07-20:::2020-07-07:::2020-07-28:::2020-07-31:::2020-09-09:::2020-09-25:::2020-10-08:::2020-11-12"}], ["labeled_dependency", 0, 22, "1751158", {"sentence": "0", "chunk": "0", "entity": "Jamestown Invest 1, LLC", "relation": "company_id", "all_relations": ""}]]}
{
"name": "NIKE, Inc.",
"sic": "RUBBER & PLASTICS FOOTWEAR [3021]",
"sic_code": "3021",
"irs_number": "930584541",
"fiscal_year_end": "531",
"state_location": "OR",
"state_incorporation": "OR",
"business_street": "ONE BOWERMAN DR",
"business_city": "BEAVERTON",
"business_state": "OR",
"business_zip": "97005-6453",
"business_phone": "5036713173",
"former_name": "NIKE INC",
"former_name_date": "19920703",
"date": "2022-01-06",
"company_id": "320187"
}
```

{:.model-param}
Expand Down
Loading