forked from apache/doris-website
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmodify-deadlink.py
140 lines (118 loc) · 6.32 KB
/
modify-deadlink.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import os
import sys
import re
from collections import namedtuple
# One record per dead-link log line that could be matched to at least one file:
#   target_file  - file that contains the dead link
#   url_line     - line number of the dead link inside target_file (as text)
#   url_path     - path of a matching file, relative to the current directory
#   url_count    - how many files with the wanted name were found
#   relative_url - replacement link (set only when exactly one file matched)
#   log_error    - the log line the record was built from
#   origin_url   - the dead link text quoted in the log line
#   sed_str      - ready-made sed command (set only when exactly one file matched)
FileInfo = namedtuple(
    'FileInfo',
    [
        'target_file',
        'url_line',
        'url_path',
        'url_count',
        'relative_url',
        'log_error',
        'origin_url',
        'sed_str',
    ],
)
def _split_location(location):
    """Split a ``<path>[:<line>[:<col>]]`` location into ``(path, line)``.

    Only trailing numeric components are treated as position information, so
    a colon that belongs to the path itself (for example a drive letter) stays
    in the path. ``line`` is ``""`` when the location carries no line number.
    """
    parsed = re.fullmatch(r"(.+?)(?::(\d*)(?::\d*)?)?", location, re.DOTALL)
    if parsed is None:
        return location, ""
    return parsed.group(1), parsed.group(2) or ""


def _find_files_named(search_dir, name):
    """Return the path of every file called *name* anywhere under *search_dir*."""
    return [
        os.path.join(root, name)
        for root, _dirs, files in os.walk(search_dir)
        if name in files
    ]


def find_file(file_str: str, search_dir: str, line_content: str) -> list:
    """Resolve one dead-link log line to a replacement link suggestion.

    The log line is expected to contain ``link '<url>'`` and
    ``in file '<path>:<line>'``. The basename of ``<url>`` (with ``.md``
    appended when missing) is searched for under ``search_dir``.

    Args:
        file_str: Log line text that is parsed for the link and file location.
        search_dir: Root directory that is walked to locate the linked file.
        line_content: Log line stored in the record and echoed in error
            output; the original link text (``origin_url``) is read from it.

    Returns:
        A list holding one ``FileInfo`` when at least one file with the wanted
        name exists, otherwise an empty list (a diagnostic is printed).
        ``relative_url`` and ``sed_str`` are filled in only when exactly one
        file matched; with several candidates they stay empty because the
        right target is ambiguous.
    """
    # Local import: only the sed command below needs it.
    import shlex

    results = []

    # Where the dead link lives: "<path>:<line>".
    source_match = re.search(r"in file '([^']+)'", file_str)
    if not source_match:
        print("No valid base file path found in the input string.")
        print(f"Error log: {line_content}")
        print("-" * 80)
        return results
    base_file_path, line_number = _split_location(source_match.group(1))

    # What the dead link points at.
    link_match = re.search(r"link '([^']+)'", file_str)
    if not link_match:
        print("No valid file path found in the input string.")
        print(f"Error log: {line_content}")
        print("-" * 80)
        return results

    # Only the file name is kept; the (broken) directory part is discarded.
    file_base_name = os.path.basename(link_match.group(1))
    if file_base_name.endswith(".md"):
        target_filename = file_base_name
    else:
        target_filename = f"{file_base_name}.md"

    found_files = _find_files_named(search_dir, target_filename)
    if not found_files:
        print(f"[ERR] No file named {target_filename} found in {search_dir}.")
        print(f"[ERR] Error log: {line_content}")
        print("-" * 80)
        return results

    url_count = len(found_files)
    # Report the last match found, relative to the current directory.
    url_path = os.path.relpath(found_files[-1], os.getcwd())

    origin_url_match = re.search(r"link '([^']+)'", line_content)
    origin_url = origin_url_match.group(1) if origin_url_match else ""

    relative_url = ""
    sed_str = ""
    if url_count == 1:
        relative_url = os.path.relpath(found_files[0], os.path.dirname(base_file_path))
        # Links always use forward slashes, whatever the platform separator is.
        relative_url = relative_url.replace(os.sep, "/")
        if not relative_url.startswith("../"):
            relative_url = "./" + relative_url
        if relative_url.endswith(".md"):
            relative_url = relative_url[:-3]
        # The target path is shell-quoted so paths with spaces or other shell
        # metacharacters survive copy/paste; safe paths are left unchanged.
        # NOTE: the link texts are inserted verbatim, so sed still interprets
        # any regex metacharacters they contain.
        sed_str = (
            f"sed -i '{line_number}s|({origin_url})|({relative_url})|' "
            f"{shlex.quote(base_file_path)}"
        )

    results.append(
        FileInfo(
            target_file=base_file_path,
            url_line=line_number,
            url_path=url_path,
            url_count=url_count,
            relative_url=relative_url,
            log_error=line_content,
            origin_url=origin_url,
            sed_str=sed_str,
        )
    )
    return results
# Read a dead-link log file and run find_file on every non-blank line
def get_deadlink(file_path, search_dir):
    """Collect the ``find_file`` results for every line of a log file.

    Args:
        file_path: Path of the text file holding one dead-link log entry per line.
        search_dir: Directory tree passed to ``find_file`` for the lookup.

    Returns:
        The concatenated records returned by ``find_file`` for all lines, or
        an empty list when ``file_path`` is not a regular file (a message is
        printed in that case).
    """
    results = []
    if not os.path.isfile(file_path):
        print(f"{file_path} is not a valid file.")
        return results

    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as log_file:
        for raw_line in log_file:
            line = raw_line.strip()
            if not line:
                # Blank lines carry no log entry; skipping them avoids a
                # spurious "No valid base file path" error per empty line.
                continue
            # The stripped line serves both as the text to parse and as the
            # log content stored in the record.
            results.extend(find_file(line, search_dir, line))
    return results
# Dump every collected record to stdout, one labelled line per field
def print_results(results):
    """Print each record as ``[LOG] <field> >> <value>`` lines plus a separator.

    Args:
        results: Iterable of records exposing the attributes listed in
            ``field_names`` below (in practice the ``FileInfo`` entries
            produced by ``find_file``).
    """
    # Fields are printed in this fixed order for every record.
    field_names = (
        "target_file",
        "url_line",
        "url_path",
        "url_count",
        "relative_url",
        "log_error",
        "origin_url",
        "sed_str",
    )
    separator = "----------------------------------------------------------------"
    for record in results:
        for field_name in field_names:
            print(f"[LOG] {field_name} >> {getattr(record, field_name)}")
        print(separator)
if __name__ == "__main__":
    # Exactly two positional arguments are required: the dead-link log file
    # and the directory tree to search for replacement targets.
    if len(sys.argv) != 3:
        # Show the name the script was actually invoked as; the message used
        # to hard-code "find_file.py", which does not match this script.
        script_name = os.path.basename(sys.argv[0])
        print(f"Usage: python {script_name} '<file_with_logs>' <search_dir>")
        sys.exit(1)
    file_with_logs = sys.argv[1]  # Log file with one dead-link entry per line
    search_dir = sys.argv[2]  # Root directory to search for the linked files
    # Resolve every log line, then print the collected records
    results = get_deadlink(file_with_logs, search_dir)
    print_results(results)