-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathgenerate_content_json.py
79 lines (67 loc) · 3.11 KB
/
generate_content_json.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import os
import json
import markdown
import yaml
import re
def clean_html_content(html_content):
    """Return *html_content* with known unwanted snippets stripped out.

    Every regex in the removal list is applied in order with ``re.DOTALL`` so
    a match may span several lines. Input that contains no match is returned
    unchanged.
    """
    unwanted_snippets = (
        # Chatbot widget: from its toggle <div> through the first closing </script>
        r'<div id="chatbot-toggle">.*?</script>',
    )
    cleaned = html_content
    for snippet_regex in unwanted_snippets:
        cleaned = re.compile(snippet_regex, re.DOTALL).sub('', cleaned)
    return cleaned
def _split_frontmatter(md_text):
    """Separate a leading ``---`` YAML frontmatter block from the Markdown body.

    Returns a ``(metadata, body)`` tuple. ``metadata`` is always a dict; it is
    empty when there is no frontmatter, when the block is empty, or when the
    YAML does not parse to a mapping. ``body`` is the text after the block
    (or the whole text when no block is present).
    """
    match = re.match(r'^---\s*\n(.*?)\n---\s*\n', md_text, re.DOTALL)
    if not match:
        return {}, md_text
    metadata = yaml.safe_load(match.group(1))
    if not isinstance(metadata, dict):
        # safe_load gives None for an empty/comment-only block and a scalar or
        # list for non-mapping YAML; none of those support .get().
        metadata = {}
    return metadata, md_text[match.end():]


def _resolve_title(metadata, body, filename):
    """Pick a title: frontmatter ``title``, else first H1, else the file stem."""
    title = metadata.get('title', '')
    if title:
        return title
    heading = re.search(r'^#\s+(.+)$', body, re.MULTILINE)
    if heading:
        return heading.group(1)
    return os.path.splitext(filename)[0]


def _page_url(rel_path, lang_code):
    """Build the page URL for a ``.md`` file path relative to the content root.

    Only the trailing ``.md`` suffix is removed, and a final path segment that
    is exactly ``_index`` is dropped so the page maps to its directory URL
    (``docs/_index.md`` -> ``/docs/``, ``_index.md`` -> ``/``). Ukrainian
    pages are prefixed with ``/ua``.
    """
    # Normalise Windows separators, then strip the suffix from the end only so
    # that '.md' appearing elsewhere in the path is left intact.
    stem = rel_path.replace('\\', '/')[:-len('.md')]
    segments = stem.split('/')
    if segments[-1] == '_index':
        segments.pop()
    url = '/' + ''.join(segment + '/' for segment in segments)
    if lang_code == 'ua':
        url = '/ua' + url
    return url


def get_content(directory, lang_code):
    """Collect every Markdown page under *directory* as a list of dicts.

    Walks *directory* recursively and, for each file ending in ``.md``,
    produces ``{'title': ..., 'content': ..., 'url': ...}`` where ``content``
    is the Markdown body (frontmatter removed) rendered to HTML and passed
    through ``clean_html_content``.

    Args:
        directory: Root folder to scan.
        lang_code: Language of the pages; ``'ua'`` adds a ``/ua`` URL prefix.

    Returns:
        A list with one dict per Markdown file, in ``os.walk`` order.
    """
    content = []
    for root, _dirs, files in os.walk(directory):
        for filename in files:
            if not filename.endswith('.md'):
                continue
            path = os.path.join(root, filename)
            with open(path, 'r', encoding='utf-8') as f:
                md_content = f.read()

            metadata, body = _split_frontmatter(md_content)
            html_content = clean_html_content(markdown.markdown(body))

            content.append({
                'title': _resolve_title(metadata, body, filename),
                'content': html_content,
                'url': _page_url(os.path.relpath(path, directory), lang_code),
            })
    return content
def generate_json(content_dir, output_file, lang_code):
    """Write the articles found under *content_dir* to *output_file* as JSON.

    The article list comes from ``get_content(content_dir, lang_code)`` and is
    written as UTF-8 with non-ASCII characters kept as-is and a 2-space
    indent. A one-line summary is printed afterwards.
    """
    articles = get_content(content_dir, lang_code)
    with open(output_file, 'w', encoding='utf-8') as out:
        out.write(json.dumps(articles, ensure_ascii=False, indent=2))
    print(f"Generated {output_file} with {len(articles)} articles.")
# Input and output roots (relative paths, resolved against the working directory)
base_dir = 'content'
static_dir = 'static'

# English: source directory and generated file
en_dir = os.path.join(base_dir, 'en')
en_output = os.path.join(static_dir, 'content-en.json')

# Ukrainian: source directory and generated file
ua_dir = os.path.join(base_dir, 'ua')
ua_output = os.path.join(static_dir, 'content-ua.json')

# Build one JSON file per language, English first
generate_json(content_dir=en_dir, output_file=en_output, lang_code='en')
generate_json(content_dir=ua_dir, output_file=ua_output, lang_code='ua')

print("JSON generation complete.")