Skip to content

Commit

Permalink
Merge pull request #26 from NFDI4BIOIMAGE/test_functions
Browse files Browse the repository at this point in the history
Improved Sorting for correct results
  • Loading branch information
lea-33 authored Dec 4, 2024
2 parents aa8d315 + 356ea3f commit 56aa8be
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 112 deletions.
81 changes: 41 additions & 40 deletions Text_Embedding.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -711,7 +711,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 4,
"id": "d1d34f7a-f031-411a-b6c6-490637656a12",
"metadata": {},
"outputs": [
Expand All @@ -737,8 +737,7 @@
}
],
"source": [
"from pdf_utilities import text_extract_from_pdfs_2\n",
"text_extract_from_pdfs_2(downloads_folder=\"downloads\", yaml_file_path=\"dict_slides_text.yml\")"
"text_extract_from_pdfs(downloads_folder=\"downloads\", yaml_file_path=\"dict_slides_text.yml\")"
]
},
{
Expand All @@ -751,19 +750,20 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 6,
"id": "cf40f07a-564e-4fe0-ae00-d9359a5b275f",
"metadata": {},
"outputs": [],
"source": [
"import yaml\n",
"# Load the YAML file containing the image paths and corresponding text\n",
"with open(\"dict_slides_text.yml\", \"r\") as yaml_file:\n",
" slide_dict = yaml.safe_load(yaml_file)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 7,
"id": "99cdd053-00de-480d-bb78-5d69fcd94af5",
"metadata": {},
"outputs": [],
Expand All @@ -777,7 +777,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 8,
"id": "795b0f18-2cce-45a1-988e-2ace9c78eb82",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -836,7 +836,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 9,
"id": "84abc4a0-9923-4e16-b935-b9626afa4d00",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -1018,7 +1018,7 @@
"[863 rows x 5 columns]"
]
},
"execution_count": 14,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -1040,7 +1040,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 10,
"id": "bf3fc678-7671-4196-9ff2-8b8009da88fb",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -1254,7 +1254,7 @@
"[863 rows x 7 columns]"
]
},
"execution_count": 15,
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
Expand Down Expand Up @@ -1292,7 +1292,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 11,
"id": "a3e17f41-8ab0-45fc-9d63-16693ef484d5",
"metadata": {},
"outputs": [],
Expand All @@ -1314,7 +1314,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 12,
"id": "b3542185-dcb4-4e47-9b05-c1f47596bbc2",
"metadata": {},
"outputs": [
Expand All @@ -1327,7 +1327,7 @@
"})"
]
},
"execution_count": 17,
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -1342,14 +1342,14 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 13,
"id": "93b5bed4-bf95-4633-bb9a-4e80dc1875e3",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "7569dfbeb2994220b3094bcc0e64a810",
"model_id": "8bf4a0d58f7a4591b415f426e6297e3c",
"version_major": 2,
"version_minor": 0
},
Expand All @@ -1363,7 +1363,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "c349970513c74d18b0c1b0fb28f65c88",
"model_id": "2c52f5c9daa04eb48f72b4e4432248fe",
"version_major": 2,
"version_minor": 0
},
Expand All @@ -1377,10 +1377,10 @@
{
"data": {
"text/plain": [
"CommitInfo(commit_url='https://huggingface.co/datasets/lea-33/SlightInsight_Data/commit/050e29cec46ee34c302c897da81856f7234daa3d', commit_message='Upload dataset', commit_description='', oid='050e29cec46ee34c302c897da81856f7234daa3d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/lea-33/SlightInsight_Data', endpoint='https://huggingface.co', repo_type='dataset', repo_id='lea-33/SlightInsight_Data'), pr_revision=None, pr_num=None)"
"CommitInfo(commit_url='https://huggingface.co/datasets/lea-33/SlightInsight_Data/commit/ddee771036f208cb2551eaf580bb78de161c9285', commit_message='Upload dataset', commit_description='', oid='ddee771036f208cb2551eaf580bb78de161c9285', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/lea-33/SlightInsight_Data', endpoint='https://huggingface.co', repo_type='dataset', repo_id='lea-33/SlightInsight_Data'), pr_revision=None, pr_num=None)"
]
},
"execution_count": 18,
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -1401,7 +1401,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 14,
"id": "359886fa-aa1b-4ec8-a841-f6d309e6c1d7",
"metadata": {},
"outputs": [
Expand All @@ -1419,12 +1419,13 @@
"source": [
"from datasets import Dataset, Features, Image\n",
"import os\n",
"from natsort import natsorted\n",
"\n",
"image_folder = \"downloads/images\"\n",
"\n",
"# List and filter only valid image files\n",
"valid_extensions = {\".png\", \".jpg\", \".jpeg\", \".bmp\", \".gif\", \".webp\"} # Add more extensions if needed\n",
"image_paths = sorted(\n",
"image_paths = natsorted(\n",
" [\n",
" os.path.join(image_folder, fname)\n",
" for fname in os.listdir(image_folder)\n",
Expand All @@ -1448,14 +1449,14 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 15,
"id": "3cc5a200-e290-4f13-ba59-7739f003002b",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5b52d499c95d48b999e2c7e742f9daf2",
"model_id": "57d06ed1b22a41878768c583596fc592",
"version_major": 2,
"version_minor": 0
},
Expand All @@ -1469,7 +1470,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "2cea9af38d9d413699d1b8a25933448d",
"model_id": "70fc9384b71441868ad30ef31b6a7460",
"version_major": 2,
"version_minor": 0
},
Expand All @@ -1483,7 +1484,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "7e5089fe2c464d898cfb592673792b9b",
"model_id": "4f4de5c3a6804d48b9405e6647870af7",
"version_major": 2,
"version_minor": 0
},
Expand All @@ -1497,10 +1498,10 @@
{
"data": {
"text/plain": [
"CommitInfo(commit_url='https://huggingface.co/datasets/lea-33/SlideInsight_Images/commit/8326eb7d49c0600a9c483d6f01f4cd88111ca621', commit_message='Upload dataset', commit_description='', oid='8326eb7d49c0600a9c483d6f01f4cd88111ca621', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/lea-33/SlideInsight_Images', endpoint='https://huggingface.co', repo_type='dataset', repo_id='lea-33/SlideInsight_Images'), pr_revision=None, pr_num=None)"
"CommitInfo(commit_url='https://huggingface.co/datasets/lea-33/SlideInsight_Images/commit/c0a1d3e674e1b5f26383d3d88ab89038b8e29ae7', commit_message='Upload dataset', commit_description='', oid='c0a1d3e674e1b5f26383d3d88ab89038b8e29ae7', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/lea-33/SlideInsight_Images', endpoint='https://huggingface.co', repo_type='dataset', repo_id='lea-33/SlideInsight_Images'), pr_revision=None, pr_num=None)"
]
},
"execution_count": 20,
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -1520,7 +1521,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 16,
"id": "848bd41f-e758-49ec-a3e5-56a31f3f1a30",
"metadata": {},
"outputs": [],
Expand All @@ -1532,7 +1533,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 17,
"id": "e48e10d4-37a0-4e3c-a4a8-11e3d813734b",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -1738,7 +1739,7 @@
"[863 rows x 7 columns]"
]
},
"execution_count": 22,
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -1749,7 +1750,7 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 18,
"id": "1b0ff383-0bf4-4569-8efc-064a34c47a27",
"metadata": {},
"outputs": [],
Expand All @@ -1772,14 +1773,14 @@
},
{
"cell_type": "code",
"execution_count": 24,
"execution_count": 19,
"id": "5e462705-3dd0-484d-baf6-1427159888d6",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "7e3cb1bdf6a04243ad4c911e49128c1e",
"model_id": "8237360fb34f4a319ec113cf2267dde0",
"version_major": 2,
"version_minor": 0
},
Expand All @@ -1792,36 +1793,36 @@
}
],
"source": [
"import stackview\n",
"from skimage.io import imread\n",
"\n",
"dataset_name = \"lea-33/SlideInsight_Images\"\n",
"images = get_all_images(dataset_name, split=\"train\")"
]
},
{
"cell_type": "code",
"execution_count": 26,
"execution_count": 20,
"id": "2d875b7a-5ebf-444b-bca7-61798b6651d2",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "143d5c91a62c4e1ebd71815f5fe00ab1",
"model_id": "259b2e8a546e44c5af7a493610bf0ed5",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(HBox(children=(VBox(children=(VBox(children=(HBox(children=(VBox(children=(ImageWidget(height=3…"
]
},
"execution_count": 26,
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import stackview\n",
"from skimage.io import imread\n",
"\n",
"stackview.sliceplot(df_loaded, images, column_x=\"UMAP0\", column_y=\"UMAP1\", zoom_factor=1, zoom_spline_order=2)"
]
},
Expand All @@ -1835,7 +1836,7 @@
},
{
"cell_type": "code",
"execution_count": 27,
"execution_count": 1,
"id": "e8f89e87-10f7-4c2f-9685-5f839117d105",
"metadata": {},
"outputs": [],
Expand All @@ -1846,7 +1847,7 @@
},
{
"cell_type": "code",
"execution_count": 28,
"execution_count": 2,
"id": "6434e314-a75a-4747-8350-521963ed8c78",
"metadata": {},
"outputs": [
Expand Down
Loading

0 comments on commit 56aa8be

Please sign in to comment.