# modify-deadlink.py

import os
import sys
import re
from collections import namedtuple

# Record produced by find_file for one dead-link log line.
FileInfo = namedtuple(
    'FileInfo',
    (
        'target_file',   # path of the file named after "in file" in the log line
        'url_line',      # text after the first ':' of that path ("" when absent)
        'url_path',      # path of a matching file, relative to the current directory
        'url_count',     # number of matching files found under the search directory
        'relative_url',  # replacement link; only filled in when url_count is 1
        'log_error',     # the log line the record was built from
        'origin_url',    # link text quoted after "link" in the log line
        'sed_str',       # suggested sed command; only filled in when url_count is 1
    ),
)

def _escape_sed_pattern(text):
    """Backslash-escape characters that are special in a sed basic regex delimited by '|'."""
    # A lone ']' needs no escaping once every '[' has been escaped.
    return re.sub(r"[\\.*\[^$|]", r"\\\g<0>", text)


def _escape_sed_replacement(text):
    """Backslash-escape characters that are special in a sed replacement delimited by '|'."""
    return re.sub(r"[\\&|]", r"\\\g<0>", text)


def _report_problem(headline, line_content, prefix=""):
    """Print a problem headline, the offending log line and an 80-dash separator."""
    print(f"{prefix}{headline}")
    print(f"{prefix}Error log: {line_content}")
    print("-" * 80)


def find_file(file_str, search_dir, line_content):
    """Locate the Markdown file a dead-link log line refers to and suggest a fix.

    ``file_str`` is expected to contain ``link '<url>'`` and
    ``in file '<path>[:<line>]'``.  The base name of ``<url>`` (with ``.md``
    appended when missing) is searched for under ``search_dir``.

    Args:
        file_str: Log line to parse for the link and the referencing file.
        search_dir: Directory tree walked to find the linked file.
        line_content: Log line stored in the record, echoed in problem
            reports, and parsed for ``origin_url``.

    Returns:
        A list holding one ``FileInfo`` when at least one matching file was
        found, otherwise an empty list (a problem report is printed instead).
        ``relative_url`` and ``sed_str`` are only filled in when exactly one
        matching file exists; with several candidates they stay empty because
        the right target is ambiguous.
    """
    # Imported locally so this function does not depend on a file-level import.
    import shlex

    base_match = re.search(r"in file '([^']+)'", file_str)
    if not base_match:
        _report_problem("No valid base file path found in the input string.", line_content)
        return []

    # e.g. "versioned_docs/version-3.0/sql-manual/data-type-overview.md:67"
    parts = base_match.group(1).split(":")
    base_file_path = parts[0]
    line_number = parts[1] if len(parts) > 1 else ""

    link_match = re.search(r"link '([^']+)'", file_str)
    if not link_match:
        _report_problem("No valid file path found in the input string.", line_content)
        return []

    # Only the base name of the link is used for the lookup.
    file_base_name = os.path.basename(link_match.group(1))
    if file_base_name.endswith(".md"):
        target_filename = file_base_name
    else:
        target_filename = f"{file_base_name}.md"

    found_files = [
        os.path.join(root, target_filename)
        for root, _dirs, files in os.walk(search_dir)
        if target_filename in files
    ]
    if not found_files:
        _report_problem(
            f"No file named {target_filename} found in {search_dir}.",
            line_content,
            prefix="[ERR] ",
        )
        return []

    url_count = len(found_files)
    # As before, url_path reports the last match relative to the working directory.
    url_path = os.path.relpath(found_files[-1], os.getcwd())

    origin_match = re.search(r"link '([^']+)'", line_content)
    origin_url = origin_match.group(1) if origin_match else ""

    relative_url = ""
    sed_str = ""
    if url_count == 1:
        relative_url = os.path.relpath(found_files[0], os.path.dirname(base_file_path))
        # Links use forward slashes regardless of the platform separator.
        relative_url = relative_url.replace(os.sep, "/")
        if not relative_url.startswith("../"):
            relative_url = "./" + relative_url
        if relative_url.endswith(".md"):
            relative_url = relative_url[:-3]

        # The parentheses are literal in a sed basic regex, so the expression
        # rewrites the "(url)" part of a Markdown link.  The URL and its
        # replacement are escaped so sed treats them as plain text, and both
        # shell words are quoted so the command is safe to paste into a shell.
        sed_expr = (
            f"{line_number}s|({_escape_sed_pattern(origin_url)})"
            f"|({_escape_sed_replacement(relative_url)})|"
        )
        sed_str = f"sed -i {shlex.quote(sed_expr)} {shlex.quote(base_file_path)}"

    return [
        FileInfo(
            target_file=base_file_path,
            url_line=line_number,
            url_path=url_path,
            url_count=url_count,
            relative_url=relative_url,
            log_error=line_content,
            origin_url=origin_url,
            sed_str=sed_str,
        )
    ]

# New function: Read the file and call find_file
def get_deadlink(file_path, search_dir):
    """Run find_file on every non-blank line of a dead-link log file.

    Args:
        file_path: Path of the log file to read.
        search_dir: Directory passed through to find_file for the lookup.

    Returns:
        The concatenated find_file results for all lines, or an empty list
        when ``file_path`` is not an existing file (a message is printed).
    """
    if not os.path.isfile(file_path):
        print(f"{file_path} is not a valid file.")
        return []

    results = []
    # Decode explicitly as UTF-8 so reading does not depend on the locale default.
    with open(file_path, 'r', encoding='utf-8') as log_file:
        for raw_line in log_file:
            line = raw_line.strip()
            if not line:
                # A blank line holds no log entry; skipping it avoids a
                # spurious "No valid base file path" report.
                continue
            # The stripped line is both the text to parse and the stored log entry.
            results.extend(find_file(line, search_dir, line))
    return results

# Print the results from the structure array
def print_results(results):
    """Print each record as one "[LOG] <field> >> <value>" line per field, then a separator."""
    field_order = (
        "target_file",
        "url_line",
        "url_path",
        "url_count",
        "relative_url",
        "log_error",
        "origin_url",
        "sed_str",
    )
    for record in results:
        for field_name in field_order:
            print(f"[LOG] {field_name} >> {getattr(record, field_name)}")
        print("----------------------------------------------------------------")

if __name__ == "__main__":
    # Exactly two arguments are required: the log file and the search directory.
    if len(sys.argv) != 3:
        # Report the name the script was actually invoked with instead of a
        # hard-coded file name that does not match this script.
        script_name = os.path.basename(sys.argv[0])
        print(f"Usage: python {script_name} '<file_with_logs>' <search_dir>")
        sys.exit(1)

    file_with_logs = sys.argv[1]  # Log file with one dead-link entry per line
    search_dir = sys.argv[2]  # Directory tree searched for the linked files

    # Build one record per resolvable log line, then print them all.
    results = get_deadlink(file_with_logs, search_dir)
    print_results(results)