"""Needle-in-a-haystack benchmark for reranker models.

The script:

1. extracts the text of the PDFs listed in ``pdfs_to_process`` and cuts it
   into fixed-size character chunks (also written to ``./chunks/<size>/``);
2. for every question, injects its "needle" sentences at random positions of
   randomly chosen chunks;
3. asks each reranker in ``embeddings_models`` to rank all chunks against the
   question and records the position at which each needle chunk shows up in
   that ranking;
4. writes the per-language average positions, the elapsed time and the
   allocated CUDA memory to a colour-coded ``results_<timestamp>.xlsx``.
"""

import copy
import gc
import os
import random
import time
import traceback
from typing import List, Tuple

import fitz  # install using: pip install PyMuPDF
import numpy as np
import openpyxl
import torch
import tqdm
from FlagEmbedding import LayerWiseFlagLLMReranker
from openpyxl.styles import Font, PatternFill
from sentence_transformers import CrossEncoder

from utils.hf_env import setup_hf_env

# NOTE(review): setup_hf_env() is defined elsewhere; if it sets Hugging Face
# cache environment variables it may need to run before the HF imports above.
# The original call order (after those imports) is kept - confirm.
setup_hf_env()

embeddings_models = [
    "BAAI/bge-reranker-base",
    "BAAI/bge-reranker-v2-m3",
    "Alibaba-NLP/gte-multilingual-reranker-base",
    # "BAAI/bge-reranker-large",
    "jinaai/jina-reranker-v2-base-multilingual",
    "mixedbread-ai/mxbai-rerank-large-v1",
    # "BAAI/bge-reranker-v2-gemma",
    # "BAAI/bge-reranker-v2-minicpm-layerwise"
]

pdfs_to_process = [
    "politique-hse-2024-2026-vf.pdf",
    "SCP-3194_by_SCP-3194-rMxNQ8yJ.pdf",
    "2407.12327v1.pdf",
]

# When True, only the first two questions of each language are used so that a
# full end-to-end run finishes quickly (smoke test).
DEBUG = False

# Model names that are loaded through FlagEmbedding instead of CrossEncoder.
_MINICPM_LAYERWISE = "BAAI/bge-reranker-v2-minicpm-layerwise"
_GEMMA = "BAAI/bge-reranker-v2-gemma"
_LLM_RERANKERS = (_MINICPM_LAYERWISE, _GEMMA)

chunks_path = "./chunks/"
pdfs_path = "./pdfs/"
chunk_size = 2048
chunk_size_path = os.path.join(chunks_path, str(chunk_size))

os.makedirs(pdfs_path, exist_ok=True)
os.makedirs(chunks_path, exist_ok=True)
os.makedirs(chunk_size_path, exist_ok=True)

# we don't care about mixing the chunks from different pdfs
page_texts = []
for pdf in pdfs_to_process:
    full_path = os.path.join(pdfs_path, pdf)
    with fitz.open(full_path) as doc:
        page_texts.extend(page.get_text() for page in doc)
text = "".join(page_texts)

chunks = [text[i : i + chunk_size] for i in range(0, len(text), chunk_size)]
if not chunks:
    # Without chunks every model run would fail; stop before loading models.
    raise RuntimeError(
        f"No text could be extracted from the PDFs in {pdfs_path!r}; nothing to benchmark."
    )

for chunk_index, chunk in enumerate(chunks):
    chunk_path = os.path.join(chunk_size_path, f"{chunk_index}.txt")
    with open(chunk_path, "w", encoding="utf-8") as f:
        f.write(chunk)

# Each entry holds a question, the needle sentences injected into the chunks
# and a reference answer (the answer is not used by this script).
needle_question_answers_pairs_english = [
    {
        "question": "Who is the owner of the Rablugorda ?",
        "needles": [
            "Little did he know, the owner of the Rablugorda was none other than the king himself.",
            "The previous owner of the Rablugorda was a spanish farmer.",
            "L'ancien propriétaire du Rablugorda était un fermier espagnol.",
            "El propietario anterior del Rablugorda era un granjero español.",
            "Die vorherige Besitzerin des Rablugorda war eine spanische Bäuerin.",
        ],
        "answer": "The owner of the Rablugorda is the king.",
    },
    {
        "question": "When was Peter Donkey Born ?",
        "needles": [
            "Peter Donkey was born in november in 1996",
            "P. Donkey was born in 1996",
            "Peter Donkey est né en novembre 1996",
            "Peter Donkey ese nacio en 1996",
        ],
        # NOTE(review): the only entry whose answer is a list instead of a str.
        "answer": ["Peter Donkey was born in november in 1996 in a farm in France"],
    },
    {
        "question": "What is the height of Mount Everest?",
        "needles": [
            "Mount Everest measures 8,848 meters above sea level.",
            "The tallest mountain is 8,848 meters high.",
            "La montagne la plus haute mesure 8 848 mètres, c'est l'Everest.",
            "La montaña más alta mide 8,848 metros.",
            "Der höchste Berg der Welt ist 8.848 Meter hoch.",
        ],
        "answer": "Mount Everest is 8,848 meters tall.",
    },
    {
        "question": "Who invented the telephone?",
        "needles": [
            "Alexander Graham Bell is credited with the invention of the telephone.",
            "The telephone was first patented by Bell in 1876.",
            "Le téléphone a été inventé par Alexander Graham Bell.",
            "El teléfono fue inventado por Alexander Graham Bell.",
            "Das Telefon wurde von Alexander Graham Bell erfunden.",
        ],
        "answer": "Alexander Graham Bell invented the telephone.",
    },
    {
        "question": "When did the Titanic sink?",
        "needles": [
            "The Titanic sank in 1912 after hitting an iceberg.",
            "Over a century ago, in 1912, the Titanic met its fate.",
            "Le Titanic a coulé en 1912.",
            "El Titanic se hundió en 1912.",
            "Die Titanic sank 1912.",
        ],
        "answer": "The Titanic sank in 1912.",
    },
    {
        "question": "What is the capital of Japan?",
        "needles": [
            "Tokyo has been the capital of Japan since 1868.",
            "The bustling city of Tokyo is the capital of Japan.",
            "La capitale du Japon est Tokyo.",
            "La capital de Japón es Tokio.",
            "Die Hauptstadt Japans ist Tokio.",
        ],
        "answer": "The capital of Japan is Tokyo.",
    },
    {
        "question": "Who wrote 'Macbeth'?",
        "needles": [
            "'Macbeth' was written by William Shakespeare.",
            "The play 'Macbeth' is one of Shakespeare's famous tragedies.",
            "'Macbeth' a été écrit par William Shakespeare.",
            "'Macbeth' fue escrito por William Shakespeare.",
            "'Macbeth' wurde von William Shakespeare geschrieben.",
        ],
        "answer": "William Shakespeare wrote 'Macbeth'.",
    },
    {
        "question": "What is the chemical symbol for water?",
        "needles": [
            "H2O is the formula for water.",
            "Water is often referred to by its chemical symbol, H2O.",
            "La formule chimique de l'eau est H2O.",
            "El símbolo químico del agua es H2O.",
            "Die chemische Formel für Wasser ist H2O.",
        ],
        "answer": "The chemical symbol for water is H2O.",
    },
    {
        "question": "Who painted the Mona Lisa?",
        "needles": [
            "The Mona Lisa was painted by Leonardo da Vinci.",
            "Leonardo da Vinci created the iconic Mona Lisa.",
            "La Joconde a été peinte par Léonard de Vinci.",
            "La Mona Lisa fue pintada por Leonardo da Vinci.",
            "Die Mona Lisa wurde von Leonardo da Vinci gemalt.",
        ],
        "answer": "Leonardo da Vinci painted the Mona Lisa.",
    },
    {
        "question": "What year did World War II end?",
        "needles": [
            "World War II ended in 1945.",
            "The global conflict concluded in 1945.",
            "La Seconde Guerre mondiale s'est terminée en 1945.",
            "La Segunda Guerra Mundial terminó en 1945.",
            "Der Zweite Weltkrieg endete 1945.",
        ],
        "answer": "World War II ended in 1945.",
    },
    {
        "question": "What is the speed of light?",
        "needles": [
            "The speed of light is approximately 299,792 kilometers per second.",
            "Light travels at 299,792 km/s.",
            "La vitesse de la lumière est de 299 792 km/s.",
            "La velocidad de la luz es de aproximadamente 299,792 kilómetros por segundo.",
            "Die Lichtgeschwindigkeit beträgt etwa 299.792 Kilometer pro Sekunde.",
        ],
        "answer": "The speed of light is 299,792 km/s.",
    },
    {
        "question": "Who discovered penicillin?",
        "needles": [
            "Penicillin was discovered by Alexander Fleming.",
            "The discovery of penicillin in 1928 was by Alexander Fleming.",
            "La pénicilline a été découverte par Alexander Fleming.",
            "La penicilina fue descubierta por Alexander Fleming.",
            "Penicillin wurde von Alexander Fleming entdeckt.",
        ],
        "answer": "Alexander Fleming discovered penicillin.",
    },
    {
        "question": "What is the largest ocean on Earth?",
        "needles": [
            "The Pacific Ocean is the largest ocean.",
            "Earth's largest ocean is the Pacific.",
            "Le plus grand océan sur Terre est l'océan Pacifique.",
            "El océano más grande de la Tierra es el Pacífico.",
            "Der größte Ozean der Erde ist der Pazifik.",
        ],
        "answer": "The largest ocean on Earth is the Pacific Ocean.",
    },
    {
        "question": "When was the Declaration of Independence signed?",
        "needles": [
            "The Declaration of Independence was signed in 1776.",
            "The foundational document was signed in 1776.",
            "In 1776, the foundational document was signed.",
            "La Déclaration d'indépendance a été signée en 1776.",
            "La Declaración de Independencia fue firmada en 1776.",
            "Die Unabhängigkeitserklärung wurde 1776 unterzeichnet.",
        ],
        "answer": "The Declaration of Independence was signed in 1776.",
    },
    {
        "question": "Who developed the theory of relativity?",
        "needles": [
            "Einstein developed the theory of relativity.",
            "The theory of relativity was formulated by Albert Einstein.",
            "Albert Einstein a développé la théorie de la relativité.",
            "Albert Einstein desarrolló la teoría de la relatividad.",
            "Albert Einstein entwickelte die Relativitätstheorie.",
        ],
        "answer": "Albert Einstein developed the theory of relativity.",
    },
    {
        "question": "What is the boiling point of water?",
        "needles": [
            "Water boils at 100 degrees Celsius.",
            "The boiling point of water is 100°C.",
            "L'eau bout à 100 degrés Celsius.",
            "El punto de ebullición del agua es 100 grados Celsius.",
            "Wasser kocht bei 100 Grad Celsius.",
        ],
        "answer": "The boiling point of water is 100 degrees Celsius.",
    },
    {
        "question": "Who is the CEO of Tesla?",
        "needles": [
            "Elon Musk is the CEO of Tesla.",
            "Tesla is led by CEO Elon Musk.",
            "Elon Musk est le PDG de Tesla.",
            "Elon Musk es el CEO de Tesla.",
            "Elon Musk ist der CEO von Tesla.",
        ],
        "answer": "Elon Musk is the CEO of Tesla.",
    },
    {
        "question": "What is the main ingredient in guacamole?",
        "needles": [
            "Avocado is the main ingredient in guacamole.",
            "Guacamole is primarily made from avocados.",
            "L'ingrédient principal du guacamole est l'avocat.",
            "El ingrediente principal del guacamole es el aguacate.",
            "Die Hauptzutat in Guacamole ist Avocado.",
        ],
        "answer": "The main ingredient in guacamole is avocado.",
    },
    {
        "question": "What is the hardest natural substance on Earth?",
        "needles": [
            "Diamond is the hardest natural substance on Earth.",
            "The hardest natural material is diamond.",
        ],
        "answer": "The hardest natural substance on Earth is diamond.",
    },
    {
        "question": "Who wrote 'Pride and Prejudice'?",
        "needles": [
            "'Pride and Prejudice' was written by Jane Austen.",
            "Jane Austen authored the novel 'Pride and Prejudice'.",
        ],
        "answer": "Jane Austen wrote 'Pride and Prejudice'.",
    },
    {
        "question": "What is the currency of the United Kingdom?",
        "needles": [
            "The British Pound is the currency of the United Kingdom.",
            "In the UK, the currency used is the Pound Sterling.",
        ],
        "answer": "The currency of the United Kingdom is the Pound Sterling.",
    },
    {
        "question": "What planet is known as the Red Planet?",
        "needles": [
            "Mars is known as the Red Planet.",
            "The planet with a reddish appearance is Mars.",
        ],
        "answer": "Mars is known as the Red Planet.",
    },
    {
        "question": "What is the formula for sodium chloride?",
        "needles": [
            "Sodium chloride is represented by NaCl.",
            "The chemical formula for table salt (sodium chloride) is NaCl.",
        ],
        "answer": "The formula for sodium chloride is NaCl.",
    },
    {
        "question": "Who is the author of 'The Catcher in the Rye'?",
        "needles": [
            "'The Catcher in the Rye' was written by J.D. Salinger.",
            "J.D. Salinger authored the novel 'The Catcher in the Rye'.",
        ],
        "answer": "J.D. Salinger is the author of 'The Catcher in the Rye'.",
    },
    {
        "question": "What is the largest planet in our solar system?",
        "needles": [
            "Jupiter is the largest planet in our solar system.",
            "The biggest planet is Jupiter.",
        ],
        "answer": "The largest planet in our solar system is Jupiter.",
    },
    {
        "question": "What is the main ingredient in traditional French baguettes?",
        "needles": [
            "Flour is the main ingredient in traditional French baguettes.",
            "Traditional French baguettes are primarily made from flour.",
        ],
        "answer": "The main ingredient in traditional French baguettes is flour.",
    },
    {
        "question": "Who was the first person to walk on the moon?",
        "needles": [
            "Neil Armstrong was the first person to walk on the moon.",
            "The first moonwalker was Neil Armstrong in 1969.",
        ],
        "answer": "Neil Armstrong was the first person to walk on the moon.",
    },
    {
        "question": "What causes seasons to change on Earth?",
        "needles": [
            "Seasons change due to the tilt of Earth's axis.",
            "The tilting of Earth's axis leads to seasonal changes.",
        ],
        "answer": "The tilt of Earth's axis causes the seasons to change.",
    },
    {
        "question": "What is the smallest bone in the human body?",
        "needles": [
            "The stapes is the smallest bone in the human body.",
            "In the human body, the tiniest bone is the stapes in the ear.",
        ],
        "answer": "The smallest bone in the human body is the stapes.",
    },
    {
        "question": "What is the capital of Australia?",
        "needles": [
            "Canberra is the capital of Australia.",
            "Australia's capital city is Canberra.",
        ],
        "answer": "The capital of Australia is Canberra.",
    },
    {
        "question": "Who was the first woman to win a Nobel Prize?",
        "needles": [
            "Marie Curie was the first woman to win a Nobel Prize.",
            "The first female Nobel laureate was Marie Curie.",
        ],
        "answer": "Marie Curie was the first woman to win a Nobel Prize.",
    },
    {
        "question": "What gas do plants absorb from the atmosphere for photosynthesis?",
        "needles": [
            "Plants absorb carbon dioxide for photosynthesis.",
            "For photosynthesis, plants take in carbon dioxide.",
        ],
        "answer": "Plants absorb carbon dioxide from the atmosphere for photosynthesis.",
    },
    {
        "question": "What is the longest river in the world?",
        "needles": [
            "The Nile River is the longest river in the world.",
            "The world's longest river, the Nile, stretches over 6,650 kilometers.",
        ],
        "answer": "The longest river in the world is the Nile River.",
    },
]

needle_question_answers_pairs_french = [
    {
        "question": "Qui a écrit 'Les Misérables'?",
        "needles": [
            "'Les Misérables' a été écrit par Victor Hugo.",
            "Victor Hugo est l'auteur du célèbre roman 'Les Misérables'.",
        ],
        "answer": "Victor Hugo a écrit 'Les Misérables'.",
    },
    {
        "question": "Quelle est la capitale de la France?",
        "needles": [
            "Paris est la capitale de la France.",
            "La France a pour capitale la ville de Paris.",
        ],
        "answer": "La capitale de la France est Paris.",
    },
    {
        "question": "Quand a eu lieu la Révolution française?",
        "needles": [
            "La Révolution française a commencé en 1789.",
            "En 1789 débute la Révolution française.",
        ],
        "answer": "La Révolution française a commencé en 1789.",
    },
    {
        "question": "Qui a inventé le cinéma?",
        "needles": [
            "Les frères Lumière ont inventé le cinéma.",
            "Le cinéma a été inventé par les frères Lumière à la fin du XIXe siècle.",
        ],
        "answer": "Les frères Lumière ont inventé le cinéma.",
    },
    {
        "question": "Quel est le fleuve le plus long de France?",
        "needles": [
            "La Loire est le plus long fleuve de France.",
            "Le fleuve le plus long en France est la Loire.",
        ],
        "answer": "Le fleuve le plus long de France est la Loire.",
    },
    {
        "question": "Qui est le président de la France en 2024?",
        "needles": [
            "Le président de la France en 2024 est Emmanuel Macron.",
            "Emmanuel Macron est président de la France en 2024.",
        ],
        "answer": "Le président de la France en 2024 est Emmanuel Macron.",
    },
    {
        "question": "Qu'est-ce que la baguette?",
        "needles": [
            "La baguette est un type de pain français.",
            "La baguette, pain traditionnel français, est longue et mince.",
        ],
        "answer": "La baguette est un type de pain français.",
    },
    {
        "question": "Quel est le monument le plus visité de Paris?",
        "needles": [
            "Le monument le plus visité de Paris est la Tour Eiffel.",
            "La Tour Eiffel est le site le plus fréquenté de Paris.",
        ],
        "answer": "Le monument le plus visité de Paris est la Tour Eiffel.",
    },
    {
        "question": "Quand a été construit le château de Versailles?",
        "needles": [
            "Le château de Versailles a été construit au XVIIe siècle.",
            "La construction du château de Versailles a débuté au 17ème siècle.",
        ],
        "answer": "Le château de Versailles a été construit au XVIIe siècle.",
    },
    {
        "question": "Quel est le fromage le plus célèbre de France?",
        "needles": [
            "Le Camembert est le fromage le plus célèbre de France.",
            "En France, le Camembert est extrêmement renommé.",
        ],
        "answer": "Le Camembert est le fromage le plus célèbre de France.",
    },
    {
        "question": "Qui a fondé la ville de Québec?",
        "needles": [
            "Samuel de Champlain a fondé la ville de Québec en 1608.",
            "La ville de Québec a été établie par Samuel de Champlain.",
        ],
        "answer": "Samuel de Champlain a fondé la ville de Québec.",
    },
    {
        "question": "Quand a été publié le premier volume de 'À la recherche du temps perdu'?",
        "needles": [
            "Le premier volume de 'À la recherche du temps perdu' a été publié en 1913.",
            "En 1913 sort le premier volume de l'œuvre de Proust.",
        ],
        "answer": "Le premier volume de 'À la recherche du temps perdu' a été publié en 1913.",
    },
    {
        "question": "Quelle est la plus haute montagne de France?",
        "needles": [
            "Le Mont Blanc est la plus haute montagne de France.",
            "En France, le sommet le plus élevé est le Mont Blanc.",
        ],
        "answer": "Le Mont Blanc est la plus haute montagne de France.",
    },
    {
        "question": "Quel est l'océan le plus proche de la France?",
        "needles": [
            "L'océan Atlantique est l'océan le plus proche de la France.",
            "La France est bordée à l'ouest par l'océan Atlantique.",
        ],
        "answer": "L'océan Atlantique est l'océan le plus proche de la France.",
    },
    {
        "question": "Qui est le créateur de la statue de la Liberté?",
        "needles": [
            "Frédéric Auguste Bartholdi est le créateur de la statue de la Liberté.",
            "La statue de la Liberté a été conçue par Bartholdi.",
        ],
        "answer": "Frédéric Auguste Bartholdi est le créateur de la statue de la Liberté.",
    },
    {
        "question": "Quelle est la durée du mandat présidentiel en France?",
        "needles": [
            "Le mandat présidentiel en France est de cinq ans.",
            "En France, le président est élu pour un mandat de cinq ans.",
        ],
        "answer": "Le mandat présidentiel en France est de cinq ans.",
    },
    {
        "question": "Quel événement historique français est associé à la prise de la Bastille?",
        "needles": [
            "La prise de la Bastille est associée à la Révolution française.",
            "Le 14 juillet 1789, la prise de la Bastille marque le début de la Révolution française.",
        ],
        "answer": "La prise de la Bastille est associée à la Révolution française.",
    },
    {
        "question": "Qui a gagné la Coupe du Monde de football en 1998?",
        "needles": [
            "La France a gagné la Coupe du Monde de football en 1998.",
            "En 1998, l'équipe de France de football remporte la Coupe du Monde.",
        ],
        "answer": "La France a gagné la Coupe du Monde de football en 1998.",
    },
    {
        "question": "Quel est le plus grand musée d'art en France?",
        "needles": [
            "Le Louvre est le plus grand musée d'art en France.",
            "Situé à Paris, le Louvre est le musée d'art le plus vaste de France.",
        ],
        "answer": "Le Louvre est le plus grand musée d'art en France.",
    },
    {
        "question": "Quand le français est-il devenu la langue officielle de la France?",
        "needles": [
            "Le français est devenu la langue officielle de la France au 16ème siècle.",
            "Au cours du 16ème siècle, le français s'établit comme la langue officielle.",
        ],
        "answer": "Le français est devenu la langue officielle de la France au 16ème siècle.",
    },
]

needle_question_answers_pairs_german = [
    {
        "question": "Wer hat das Periodensystem entwickelt?",
        "needles": [
            "Dmitri Mendelejew hat das Periodensystem entwickelt.",
            "Das Periodensystem wurde von Dmitri Mendelejew erschaffen.",
        ],
        "answer": "Dmitri Mendelejew hat das Periodensystem entwickelt.",
    },
    {
        "question": "Was ist die Hauptstadt von Deutschland?",
        "needles": [
            "Berlin ist die Hauptstadt von Deutschland.",
            "Die Hauptstadt Deutschlands ist Berlin.",
        ],
        "answer": "Die Hauptstadt von Deutschland ist Berlin.",
    },
    {
        "question": "Wann fiel die Berliner Mauer?",
        "needles": [
            "Die Berliner Mauer fiel 1989.",
            "1989 markierte das Ende der Berliner Mauer.",
        ],
        "answer": "Die Berliner Mauer fiel 1989.",
    },
    {
        "question": "Wer hat den Buchdruck erfunden?",
        "needles": [
            "Johannes Gutenberg erfand den Buchdruck.",
            "Der Buchdruck wurde von Johannes Gutenberg im 15. Jahrhundert erfunden.",
        ],
        "answer": "Johannes Gutenberg hat den Buchdruck erfunden.",
    },
    {
        "question": "Welcher Fluss ist der längste in Deutschland?",
        "needles": [
            "Der Rhein ist der längste Fluss in Deutschland.",
            "In Deutschland ist der Rhein der längste Fluss.",
        ],
        "answer": "Der längste Fluss in Deutschland ist der Rhein.",
    },
    {
        "question": "Wer schrieb 'Faust'?",
        "needles": [
            "'Faust' wurde von Johann Wolfgang von Goethe geschrieben.",
            "Johann Wolfgang von Goethe ist der Autor von 'Faust'.",
        ],
        "answer": "Johann Wolfgang von Goethe schrieb 'Faust'.",
    },
    {
        "question": "Was ist das größte Bierfest der Welt?",
        "needles": [
            "Das Oktoberfest ist das größte Bierfest der Welt.",
            "In München findet das weltweit größte Bierfest, das Oktoberfest, statt.",
        ],
        "answer": "Das größte Bierfest der Welt ist das Oktoberfest.",
    },
    {
        "question": "Wann wurde die Bundesrepublik Deutschland gegründet?",
        "needles": [
            "Die Bundesrepublik Deutschland wurde 1949 gegründet.",
            "1949 war das Gründungsjahr der Bundesrepublik Deutschland.",
        ],
        "answer": "Die Bundesrepublik Deutschland wurde 1949 gegründet.",
    },
    {
        "question": "Welches Schloss ist das größte in Deutschland?",
        "needles": [
            "Schloss Neuschwanstein ist das größte Schloss in Deutschland.",
            "Das berühmte Schloss Neuschwanstein gilt als das größte in Deutschland.",
        ],
        "answer": "Das größte Schloss in Deutschland ist Schloss Neuschwanstein.",
    },
    {
        "question": "Wer ist der bekannteste Komponist Deutschlands?",
        "needles": [
            "Ludwig van Beethoven ist der bekannteste Komponist Deutschlands.",
            "Beethoven gilt als der berühmteste deutsche Komponist.",
        ],
        "answer": "Der bekannteste Komponist Deutschlands ist Ludwig van Beethoven.",
    },
    {
        "question": "Wer war der erste Bundeskanzler der Bundesrepublik Deutschland?",
        "needles": [
            "Konrad Adenauer war der erste Bundeskanzler der Bundesrepublik Deutschland.",
            "Der erste Bundeskanzler nach Gründung der Bundesrepublik war Konrad Adenauer.",
        ],
        "answer": "Konrad Adenauer war der erste Bundeskanzler der Bundesrepublik Deutschland.",
    },
    {
        "question": "Was ist das traditionelle Gericht in Bayern?",
        "needles": [
            "Das traditionelle Gericht in Bayern ist Weißwurst.",
            "In Bayern sind Weißwürste ein traditionelles Gericht.",
        ],
        "answer": "Das traditionelle Gericht in Bayern ist Weißwurst.",
    },
    {
        "question": "Welcher Fluss fließt durch Berlin?",
        "needles": [
            "Die Spree fließt durch Berlin.",
            "Berlin wird von der Spree durchquert.",
        ],
        "answer": "Die Spree fließt durch Berlin.",
    },
    {
        "question": "Was ist das Grundgesetz?",
        "needles": [
            "Das Grundgesetz ist die Verfassung der Bundesrepublik Deutschland.",
            "In Deutschland fungiert das Grundgesetz als die Verfassung.",
        ],
        "answer": "Das Grundgesetz ist die Verfassung der Bundesrepublik Deutschland.",
    },
    {
        "question": "Wann wurde Berlin wieder zur Hauptstadt Deutschlands?",
        "needles": [
            "Berlin wurde 1990 wieder zur Hauptstadt Deutschlands.",
            "Nach der Wiedervereinigung wurde Berlin 1990 erneut Hauptstadt.",
        ],
        "answer": "Berlin wurde 1990 wieder zur Hauptstadt Deutschlands.",
    },
    {
        "question": "Wer war Ludwig van Beethoven?",
        "needles": [
            "Ludwig van Beethoven war ein deutscher Komponist der klassischen Musik.",
            "Beethoven, ein berühmter deutscher Komponist, prägte die klassische Musik tiefgehend.",
        ],
        "answer": "Ludwig van Beethoven war ein deutscher Komponist.",
    },
    {
        "question": "Was ist der Schwarzwald?",
        "needles": [
            "Der Schwarzwald ist ein großes Waldgebiet in Deutschland.",
            "In Süddeutschland liegt der bekannte Schwarzwald.",
        ],
        "answer": "Der Schwarzwald ist ein großes Waldgebiet in Deutschland.",
    },
    {
        "question": "Welches Bauwerk ist bekannt als das deutsche Tor zur Welt?",
        "needles": [
            "Der Hamburger Hafen ist bekannt als das deutsche Tor zur Welt.",
            "In Deutschland gilt der Hamburger Hafen als Tor zur Welt.",
        ],
        "answer": "Der Hamburger Hafen ist bekannt als das deutsche Tor zur Welt.",
    },
    {
        "question": "Wer schrieb 'Die Blechtrommel'?",
        "needles": [
            "'Die Blechtrommel' wurde von Günter Grass geschrieben.",
            "Günter Grass ist der Autor des Romans 'Die Blechtrommel'.",
        ],
        "answer": "Günter Grass schrieb 'Die Blechtrommel'.",
    },
    {
        "question": "Was ist das größte Tier in Deutschland?",
        "needles": [
            "Der Elch ist das größte in Deutschland lebende Tier.",
            "In den deutschen Wäldern ist der Elch das größte Tier.",
        ],
        "answer": "Der Elch ist das größte Tier in Deutschland.",
    },
]

needle_question_answers_pairs_spanish = [
    {
        "question": "¿Quién pintó 'Guernica'?",
        "needles": [
            "'Guernica' fue pintado por Pablo Picasso.",
            "Pablo Picasso es el autor del famoso cuadro 'Guernica'.",
        ],
        "answer": "Pablo Picasso pintó 'Guernica'.",
    },
    {
        "question": "¿Cuál es la capital de España?",
        "needles": [
            "Madrid es la capital de España.",
            "La capital de España es Madrid.",
        ],
        "answer": "La capital de España es Madrid.",
    },
    {
        "question": "¿Cuándo comenzó la Guerra Civil Española?",
        "needles": [
            "La Guerra Civil Española comenzó en 1936.",
            "En 1936, estalló la Guerra Civil Española.",
        ],
        "answer": "La Guerra Civil Española comenzó en 1936.",
    },
    {
        "question": "¿Quién escribió 'Don Quijote de la Mancha'?",
        "needles": [
            "'Don Quijote de la Mancha' fue escrito por Miguel de Cervantes.",
            "Miguel de Cervantes es el autor de 'Don Quijote'.",
        ],
        "answer": "Miguel de Cervantes escribió 'Don Quijote de la Mancha'.",
    },
    {
        "question": "¿Cuál es el río más largo de España?",
        "needles": [
            "El río Ebro es el más largo de España.",
            "En España, el río más largo es el Ebro.",
        ],
        "answer": "El río más largo de España es el Ebro.",
    },
    {
        "question": "¿Quién es el presidente de España en 2024?",
        "needles": [
            "El presidente de España en 2024 es Pedro Sánchez.",
            "En 2024, Pedro Sánchez es el presidente de España.",
        ],
        "answer": "El presidente de España en 2024 es Pedro Sánchez.",
    },
    {
        "question": "¿Qué es la paella?",
        "needles": [
            "La paella es un plato tradicional español.",
            "El plato típico español, la paella, contiene arroz, mariscos y safrán.",
        ],
        "answer": "La paella es un plato tradicional español.",
    },
    {
        "question": "¿Qué monumento es el más visitado en España?",
        "needles": [
            "La Sagrada Familia es el monumento más visitado de España.",
            "En Barcelona, la Sagrada Familia atrae a más visitantes que cualquier otro sitio.",
        ],
        "answer": "La Sagrada Familia es el monumento más visitado en España.",
    },
    {
        "question": "¿Cuándo fue construida la Alhambra?",
        "needles": [
            "La Alhambra fue construida en el siglo XIV.",
            "En el siglo XIV se erigió la Alhambra en Granada.",
        ],
        "answer": "La Alhambra fue construida en el siglo XIV.",
    },
    {
        "question": "¿Cuál es el queso más famoso de España?",
        "needles": [
            "El Manchego es el queso más famoso de España.",
            "En España, el queso Manchego es ampliamente conocido y apreciado.",
        ],
        "answer": "El queso más famoso de España es el Manchego.",
    },
    {
        "question": "¿Quién fue el primer rey de España?",
        "needles": [
            "Don Pelayo es considerado el primer rey de España.",
            "El primer rey de lo que hoy es España fue Don Pelayo.",
        ],
        "answer": "Don Pelayo fue el primer rey de España.",
    },
    {
        "question": "¿Cuál es el deporte más popular en España?",
        "needles": [
            "El fútbol es el deporte más popular en España.",
            "En España, el deporte dominante y más seguido es el fútbol.",
        ],
        "answer": "El deporte más popular en España es el fútbol.",
    },
    {
        "question": "¿Qué es el Camino de Santiago?",
        "needles": [
            "El Camino de Santiago es una ruta de peregrinación en España.",
            "Muchos peregrinos recorren el Camino de Santiago cada año.",
        ],
        "answer": "El Camino de Santiago es una ruta de peregrinación.",
    },
    {
        "question": "¿Dónde se encuentra la Alhambra?",
        "needles": [
            "La Alhambra está ubicada en Granada.",
            "En la ciudad de Granada se encuentra el palacio de la Alhambra.",
        ],
        "answer": "La Alhambra se encuentra en Granada.",
    },
    {
        "question": "¿Quién es el autor de 'Cien años de soledad'?",
        "needles": [
            "Gabriel García Márquez escribió 'Cien años de soledad'.",
            "El colombiano Gabriel García Márquez es el autor de la novela 'Cien años de soledad'.",
        ],
        "answer": "Gabriel García Márquez es el autor de 'Cien años de soledad'.",
    },
    {
        "question": "¿Cuál es el pico más alto de España?",
        "needles": [
            "El Teide es el pico más alto de España.",
            "En las Islas Canarias, El Teide se erige como el pico más alto.",
        ],
        "answer": "El Teide es el pico más alto de España.",
    },
    {
        "question": "¿Cuándo se restauró la democracia en España?",
        "needles": [
            "La democracia fue restaurada en España en 1978.",
            "En 1978, con la aprobación de la nueva Constitución, se restauró la democracia en España.",
        ],
        "answer": "La democracia en España fue restaurada en 1978.",
    },
    {
        "question": "¿Qué es una tapa en la gastronomía española?",
        "needles": [
            "Una tapa es un pequeño aperitivo que se sirve en los bares de España.",
            "Las tapas son aperitivos típicos en España, servidos en pequeñas porciones.",
        ],
        "answer": "Una tapa es un pequeño aperitivo en España.",
    },
    {
        "question": "¿Quién pintó 'Las Meninas'?",
        "needles": [
            "'Las Meninas' fue pintado por Diego Velázquez.",
            "Diego Velázquez es el autor de la obra maestra 'Las Meninas'.",
        ],
        "answer": "Diego Velázquez pintó 'Las Meninas'.",
    },
    {
        "question": "¿Cuál es la fiesta más famosa de España?",
        "needles": [
            "Los Sanfermines son la fiesta más famosa de España.",
            "En Pamplona, los Sanfermines atraen a visitantes de todo el mundo.",
        ],
        "answer": "Los Sanfermines son la fiesta más famosa de España.",
    },
]

if DEBUG:
    needle_question_answers_pairs_english = needle_question_answers_pairs_english[:2]
    needle_question_answers_pairs_french = needle_question_answers_pairs_french[:2]
    needle_question_answers_pairs_german = needle_question_answers_pairs_german[:2]
    needle_question_answers_pairs_spanish = needle_question_answers_pairs_spanish[:2]

# Pristine copy of the chunks; needles are only ever injected into copies.
original_chunks = list(chunks)

langs = ["english", "french", "german", "spanish"]
# Question sets keyed by the language names used in ``langs``.
_pairs_by_language = {
    "english": needle_question_answers_pairs_english,
    "french": needle_question_answers_pairs_french,
    "german": needle_question_answers_pairs_german,
    "spanish": needle_question_answers_pairs_spanish,
}

# The "average index" cells are colour-coded once all models have run:
# a value of 0 maps to green and len(original_chunks) to the far (dark) end
# of the scale, see get_color().
xlsx_name = f"results_{time.time()}.xlsx"


def inject_needle_in_text(needle_text: str, text: str) -> str:
    """Return *text* with *needle_text* inserted at a random position.

    The insertion index is drawn uniformly from ``0..len(text)`` (both ends
    included), so the needle may also end up before the first or after the
    last character.
    """
    random_index = random.randint(0, len(text))
    # Slicing already covers both boundaries (index 0 and len(text)).
    return text[:random_index] + needle_text + text[random_index:]


def inject_needle_in_chunks(needle_question_answer, chunks) -> Tuple[List[str], List[int]]:
    """Inject every needle of a question into a randomly chosen chunk.

    Args:
        needle_question_answer: dict with at least a ``"needles"`` list of
            strings. The chosen chunk indexes are also stored on it under
            ``"expected_matches"``.
        chunks: list of chunk strings; it is not modified.

    Returns:
        ``(modified_chunks, indexes)`` where ``modified_chunks`` is a copy of
        *chunks* with the needles inserted and ``indexes[i]`` is the chunk
        index that received needle ``i``. Several needles may land in the
        same chunk.

    Raises:
        ValueError: if *chunks* is empty.
    """
    if not chunks:
        raise ValueError("cannot inject needles: the chunk list is empty")

    # Strings are immutable, so a shallow copy is enough to protect the input.
    cpy_chunks = list(chunks)
    last_index = len(cpy_chunks) - 1
    needles = needle_question_answer["needles"]

    # All target chunks are drawn first, then the in-chunk positions, which
    # keeps the sequence of random draws stable for a given seed.
    random_indexes = [random.randint(0, last_index) for _ in needles]
    needle_question_answer["expected_matches"] = random_indexes

    for target_index, needle_text in zip(random_indexes, needles):
        cpy_chunks[target_index] = inject_needle_in_text(needle_text, cpy_chunks[target_index])
    return cpy_chunks, random_indexes


def _load_reranker(reranker_model):
    """Instantiate the reranker matching *reranker_model*."""
    if reranker_model in _LLM_RERANKERS:
        return LayerWiseFlagLLMReranker(reranker_model, use_fp16=True)
    return CrossEncoder(
        reranker_model,
        trust_remote_code=True,
        device="cuda",
        automodel_args={"torch_dtype": "auto"},
    )


def _rank_documents(model, reranker_model, question, documents) -> List[int]:
    """Return the document indexes in the order the model ranks them."""
    if reranker_model in _LLM_RERANKERS:
        pairs = [[question, doc] for doc in documents]
        # NOTE(review): the [0] assumes compute_score() wraps the per-document
        # scores in an outer list - confirm against the installed
        # FlagEmbedding version.
        if reranker_model == _MINICPM_LAYERWISE:
            results = model.compute_score(pairs, cutoff_layers=[28])[0]
        else:
            results = model.compute_score(pairs)[0]
        # argsort is ascending; reverse it so the highest score comes first.
        return np.argsort(results)[::-1].tolist()

    results = model.rank(question, documents, return_documents=False, convert_to_tensor=True)
    return [result["corpus_id"] for result in results]


def _evaluate_reranker(model, reranker_model) -> dict:
    """Return, per language, the average rank position of the needle chunks."""
    languages_averages = {}
    for lang in tqdm.tqdm(langs, position=0):
        stored_scored = []
        for pair in tqdm.tqdm(_pairs_by_language[lang], position=1):
            # inject_needle_in_chunks() works on its own copy of the chunks.
            modified_chunks, ground_truth_indexes = inject_needle_in_chunks(pair, original_chunks)
            ranked_indexes = _rank_documents(model, reranker_model, pair["question"], modified_chunks)
            # Position of each needle chunk in the ranking (0 = ranked first).
            stored_scored.extend(
                ranked_indexes.index(expected_index) for expected_index in ground_truth_indexes
            )
            gc.collect()
        languages_averages[lang] = sum(stored_scored) / len(stored_scored)
    return languages_averages


def _truncate_to_tenth(value):
    """Truncate *value* to one decimal place (no rounding)."""
    return int(value * 10) / 10


# Initialize the workbook
wb = openpyxl.Workbook()
worksheet = wb.active
cols = [
    "Model",
    "English average index",
    "French average index",
    "German average index",
    "Spanish average index",
    "Average index",
    "Total chunks",
    "Time spent(s)",
    "VRAM used (MB)",
]
worksheet.append(cols)

for reranker_model in embeddings_models:
    gc.collect()
    torch.cuda.empty_cache()
    model = None
    try:
        model = _load_reranker(reranker_model)
        _time = time.time()
        languages_averages = _evaluate_reranker(model, reranker_model)
        time_spent = time.time() - _time
        vram_used = torch.cuda.memory_allocated() / 1024 / 1024
        overall_average = sum(languages_averages.values()) / len(languages_averages)
        worksheet.append(
            [reranker_model]
            + [_truncate_to_tenth(languages_averages[lang]) for lang in langs]
            + [
                _truncate_to_tenth(overall_average),
                len(original_chunks),
                time_spent,
                vram_used,
            ]
        )
    except Exception as e:
        # Best effort: a failing model is reported as an "Error" row so that
        # the remaining models still get benchmarked.
        traceback.print_exc()
        print(f"Error: {e}")
        worksheet.append(
            [reranker_model, "Error", "Error", "Error", "Error", "Error", len(original_chunks), 0, 0]
        )
    finally:
        # Drop the model on the error path too, so it does not stay alive
        # while the next model is being loaded.
        model = None
        gc.collect()
        torch.cuda.empty_cache()

max_value = len(original_chunks)


def interpolate_color(color1, color2, value, max_value):
    """Linearly blend two RGB triples and return an opaque ARGB hex string.

    ``value / max_value`` is the blend ratio (0 -> *color1*, 1 -> *color2*).
    The ratio is clamped to ``[0, 1]`` and a non-positive *max_value* yields
    *color1*, so the result is always a valid ``FFRRGGBB`` string.
    """
    if max_value <= 0:
        ratio = 0.0
    else:
        ratio = min(max(value / max_value, 0.0), 1.0)
    r, g, b = (int(start + (end - start) * ratio) for start, end in zip(color1, color2))
    return f"FF{r:02X}{g:02X}{b:02X}"


def get_color(value, max_value):
    """Map *value* in ``0..max_value`` onto a green-yellow-orange-red-black scale.

    The range is split into four equal bands and the colour is interpolated
    inside the band containing *value*.
    """
    green = (0, 255, 0)
    yellow = (255, 255, 0)
    orange = (255, 165, 0)
    red = (255, 0, 0)
    black = (0, 0, 0)

    quarter = max_value / 4
    if value <= quarter:
        return interpolate_color(green, yellow, value, quarter)
    if value <= max_value / 2:
        return interpolate_color(yellow, orange, value - quarter, quarter)
    if value <= 3 * max_value / 4:
        return interpolate_color(orange, red, value - max_value / 2, quarter)
    return interpolate_color(red, black, value - 3 * max_value / 4, quarter)


# Save the raw numbers first, then save again once the cells are styled.
wb.save(xlsx_name)

bold_font = Font(bold=True)
for row in range(2, 2 + len(embeddings_models)):
    # Columns 2-6 hold the four per-language averages and the overall average.
    for col in range(2, 7):
        cell = worksheet.cell(row=row, column=col)
        # "Error" rows are coloured like the worst possible value.
        value = float(cell.value if cell.value != "Error" else max_value)
        color = get_color(value, max_value)
        cell.fill = PatternFill(start_color=color, end_color=color, fill_type="solid")
    # Bold the overall "Average index" column (column 6).
    worksheet.cell(row=row, column=6).font = bold_font
wb.save(xlsx_name)