# fetchPDFs.py

from pdf2image import convert_from_bytes
import pytesseract
import numpy as np
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
import re
import random
from utilities import *
import re
from PIL import Image
import random
from datetime import datetime
from clean_data import *

def extract_red_text_directly(page_image):
    """
    OCR only the red-coloured content of a rendered PDF page.

    Every pixel that is not classified as red (red channel > 90 and blue
    channel < 90) is painted with a fixed light background colour before
    the page is handed to Tesseract, so only the red text remains visible.

    Returns the stripped OCR text. A single leading space is prepended
    unless the raw OCR output begins with "(".
    """
    pixels = np.array(page_image)
    red_channel = pixels[:, :, 0]
    blue_channel = pixels[:, :, 2]

    # A pixel is background when it fails either half of the "red" test.
    background = (red_channel <= 90) | (blue_channel >= 90)

    # Work on a copy so the source array is left untouched.
    isolated = pixels.copy()
    isolated[background] = [252, 250, 249]

    red_only_image = Image.fromarray(isolated.astype(np.uint8))

    # Run OCR over the whole masked page.
    text = pytesseract.image_to_string(
        red_only_image,
        config="--psm 6 --oem 1",
    )

    # The check is made on the raw (unstripped) OCR output.
    prefix = "" if text.startswith("(") else " "
    return prefix + text.strip()

def fetch_pdf_content(url, latest_date):
    """
    Download every archived report PDF that is dated after ``latest_date``.

    The archive page at ``url`` is fetched and the ``<option>`` elements of
    its file selector are scanned. The first eight characters of the part
    of each option's text before the first "_" are read as a ``YYYYMMDD``
    date; options whose date is not strictly after ``latest_date`` (or does
    not parse) are skipped. Each remaining option's ``value`` attribute is
    appended to the file-download URL to fetch the PDF.

    Every request is retried indefinitely, pausing 10-15 seconds between
    attempts, so this function only returns once all downloads succeeded.

    Side effect: sets the module-level global ``fileDates`` to the list of
    ``YYYYMMDD`` strings of the selected reports, in the same order as the
    returned list.

    Parameters:
        url: address of the archive page.
        latest_date: value compared against ``datetime`` objects; reports
            dated on or before it are ignored.

    Returns:
        A list of ``bytes`` objects, one per downloaded PDF.
    """
    global fileDates

    pdfLink = 'https://www.oni.navy.mil/ONI-Reports/Shipping-Threat-Reports/Worldwide-Threat-to-Shipping-Report-Archive/FileId/'

    def isValidDate(date_str):
        """Return True when date_str is a YYYYMMDD date after latest_date."""
        try:
            return datetime.strptime(date_str, "%Y%m%d") > latest_date
        except ValueError:
            return False

    def datePrefix(option):
        """Return the leading (up to 8 character) date part of an option's text."""
        return option.text.split("_")[0][:8]

    while True:
        try:
            response = requests.get(url, timeout=75)
            response.encoding = 'utf-8'

            if response.status_code != 200:
                # Previously a non-200 answer fell straight through to the
                # next loop iteration and re-requested the page with no
                # delay; back off exactly like any other failed attempt.
                print(f"Unexpected HTTP status {response.status_code} for {url}")
                time.sleep(random.randint(10, 15))
                continue

            soup = BeautifulSoup(response.text, 'html.parser')

            options = soup.select("div.ModDNNLinksC select.NormalTextBox option")
            fileElements = [option for option in options if isValidDate(datePrefix(option))]

            # Published for fetch_pdfs, which looks dates up by list index.
            fileDates = [datePrefix(option) for option in fileElements]

            fileContents = []
            for fileElement in fileElements:
                FileId = fileElement['value']
                while True:
                    try:
                        pdf_response = requests.get(pdfLink + FileId, timeout=75)
                        pdf_response.raise_for_status()
                        fileContents.append(pdf_response.content)
                        break
                    except Exception as e:
                        print(f"{e}")
                        time.sleep(random.randint(10, 15))
            return fileContents
        except Exception as e:
            print(f"{e}")
            time.sleep(random.randint(10, 15))

def convert_to_decimal(coord):
    """
    Turn a "degrees:minutes" coordinate with a hemisphere letter
    (for example 04:53N or 120:30W) into signed decimal degrees.

    Southern and western values come back negative. A ValueError is
    raised when the start of ``coord`` does not look like DD:MM or DDD:MM
    followed by one of N, S, W, E.
    """
    parsed = re.match(r'(\d{2,3}):(\d{2})([NSWE])', coord)
    if parsed is None:
        raise ValueError(f"Invalid coordinate format: {coord}")

    deg_text, min_text, hemisphere = parsed.groups()

    # One minute is a sixtieth of a degree.
    magnitude = int(deg_text) + int(min_text) / 60.0

    # South and west lie on the negative side of the axis.
    sign = -1 if hemisphere in ('S', 'W') else 1
    return sign * magnitude

_ARCHIVE_URL = 'https://www.oni.navy.mil/ONI-Reports/Shipping-Threat-Reports/Worldwide-Threat-to-Shipping-Report-Archive/'

_MONTH_NAMES = (
    'January', 'February', 'March', 'April', 'May', 'June', 'July',
    'August', 'September', 'October', 'November', 'December',
)

# "DD:MM[N|S] — DDD:MM[E|W]" latitude/longitude pair separated by an em dash.
_COORDINATE_PATTERN = re.compile(r'(\d{2}:\d{2}[NS])\s*—\s*(\d{3}:\d{2}[EW])')

# "<day> <month name>", case-insensitive.
_DATE_PATTERN = re.compile(
    r'\b(\d{1,2})\s+(' + '|'.join(_MONTH_NAMES) + r')\b',
    re.IGNORECASE,
)


def _resolve_incident_date(paragraph, published):
    """
    Return the date of the first "<day> <month>" mention in ``paragraph``.

    The paragraph carries no year, so the publication year is tried first
    and the year before it second; the first candidate that is a real
    calendar date not later than ``published`` wins. When the paragraph has
    no date mention, or the mention cannot form a valid date (OCR noise
    such as "31 February"), ``published`` itself is returned.
    """
    match = _DATE_PATTERN.search(paragraph)
    if not match:
        return published

    day_text, month_name = match.groups()
    month = _MONTH_NAMES.index(month_name.capitalize()) + 1

    for year in (published.year, published.year - 1):
        try:
            candidate = pd.Timestamp(year, month, int(day_text))
        except ValueError:
            # Not a real calendar date in this year (e.g. day 0, 29 Feb).
            continue
        if candidate <= published:
            return candidate
    return published


def _incident_row(paragraph, latitude, longitude, incident):
    """Build one output record for an incident paragraph."""
    return {
        'tracking_number': '',
        'date_of_incident': incident.strftime('%Y-%m-%d'),
        # Year and Month are derived from the resolved incident date so
        # they always agree with date_of_incident.
        'Year': str(incident.year),
        'Month': _MONTH_NAMES[incident.month - 1],
        'primary_category': 'Piracy and Robbery',
        'event_type': '',
        'event_subtype': '',
        'event_associated_number': '',
        'event_associated_number_unit': '',
        'latitude': convert_to_decimal(latitude),
        'longitude': convert_to_decimal(longitude),
        'description': paragraph,
        'actor1': '',
        'assoc_actor_1': '',
        'actor2': '',
        'assoc_actor_2': '',
        'actor3': '',
        'assoc_actor_3': '',
        'name_of_ship1': '',
        'imo_number_of_ship1': '',
        'type_of_ship1': '',
        'name_of_ship2': '',
        'imo_number_of_ship2': '',
        'type_of_ship2': '',
        'name_of_ship3': '',
        'imo_number_of_ship3': '',
        'type_of_ship3': '',
        'flag_states_involved': '',
        'location': '',
        'location_precision': 'Point',
        'location_url': _ARCHIVE_URL,
        'source_name': 'Office of Naval Intelligence Worldwide Threats to Shipping',
        'source_link_external': _ARCHIVE_URL,
        'source_name2': '',
        'source_link2_external': '',
        'compiled_from': '',
        'compilation_source_link_external': ''
    }


def fetch_pdfs(latest_date):
    """
    Download the reports newer than ``latest_date`` and extract incidents.

    Each PDF is rendered to images at 400 dpi, the red text of every page
    is OCR'd, and the combined text is split on the "(U)" marker into
    paragraphs. A paragraph becomes a row only when it contains a
    latitude/longitude pair. Its incident date is the first "<day> <month>"
    mention in the paragraph, placed in the publication year or the year
    before so that it never falls after the publication date; without a
    usable mention the publication date is used.

    Relies on the module-level global ``fileDates`` that
    ``fetch_pdf_content`` sets, indexed in step with the PDFs it returns.

    Parameters:
        latest_date: passed through to ``fetch_pdf_content``.

    Returns:
        The result of ``format_data`` applied to a DataFrame of the rows.
    """
    # Fetch the PDF content as bytes
    pdf_contents = fetch_pdf_content(_ARCHIVE_URL, latest_date)

    print("Successfully Fetched PDF(s)")

    rows = []
    for index, pdf_content in enumerate(pdf_contents):

        # Convert the pages to images and OCR the red text of each one
        pages = convert_from_bytes(pdf_content, dpi=400)
        red_text = "".join(extract_red_text_directly(page_image) for page_image in pages)

        print("Successfully extracted text")

        # "({U)" is rewritten to the marker "(U)" before splitting on it
        processed_text = red_text.strip().replace("({U)", "(U)")
        paragraphs = [p.replace("\n", " ").strip() for p in processed_text.split("(U)") if p.strip()]

        # Publication date of this PDF as YYYYMMDD text
        date = fileDates[index]
        published = pd.Timestamp(int(date[:4]), int(date[4:6]), int(date[6:8]))

        for paragraph in paragraphs:
            match = _COORDINATE_PATTERN.search(paragraph)
            if not match:
                continue

            latitude, longitude = match.groups()
            incident = _resolve_incident_date(paragraph, published)
            rows.append(_incident_row(paragraph, latitude, longitude, incident))

    data = pd.DataFrame(rows)
    data = format_data(data)

    return data