-
Notifications
You must be signed in to change notification settings - Fork 123
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Create a SQLite3 database of Google Pinyin entries and insert them together with their pinyin abbreviations, using Python 3.12.2
- Loading branch information
Showing 3 changed files with 72 additions and 0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
import sqlite3 | ||
|
||
# Path to the SQLite database file
db_file = 'pinyin_data.sqlite3'

# SQL statement to create the pinyin_data table. Columns:
#   hz    hanzi (the Chinese text of the entry)
#   py    pinyin
#   abbr  abbreviation of the pinyin
#   freq  frequency
create_table_sql = """
CREATE TABLE IF NOT EXISTS pinyin_data (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    hz TEXT NOT NULL,
    py TEXT NOT NULL,
    abbr TEXT NOT NULL,
    freq REAL NOT NULL
);
"""

# SQL statements to create indexes on the pinyin and abbreviation columns
create_index_pinyin_sql = "CREATE INDEX IF NOT EXISTS idx_pinyin ON pinyin_data(py);"
create_index_abbreviation_sql = "CREATE INDEX IF NOT EXISTS idx_abbr ON pinyin_data(abbr);"


def create_database(path=db_file):
    """Create the pinyin_data table and its two indexes in the database at *path*.

    The database file is created if it does not exist. Every statement uses
    ``IF NOT EXISTS``, so calling this again on an existing database is a
    no-op rather than an error.

    Args:
        path: Location of the SQLite database file. Defaults to ``db_file``.

    Raises:
        sqlite3.Error: If the database cannot be opened or a statement fails.
    """
    conn = sqlite3.connect(path)
    try:
        cur = conn.cursor()
        cur.execute(create_table_sql)
        cur.execute(create_index_pinyin_sql)
        cur.execute(create_index_abbreviation_sql)
        conn.commit()
    finally:
        # Release the file handle even when one of the statements fails.
        conn.close()


if __name__ == "__main__":
    create_database()
    print("Database, table, and indexes created successfully.")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
import sqlite3 | ||
|
||
# Parameterized insert used for every accepted dictionary line
INSERT_SQL = 'INSERT INTO pinyin_data (hz, py, abbr, freq) VALUES (?, ?, ?, ?)'


def parse_entry(line):
    """Parse one whitespace-separated dictionary line into a database row.

    The expected layout is ``phrase score flag syllable [syllable ...]``.
    Only lines whose third field is ``'0'`` are accepted (presumably a flag
    in the raw dictionary; confirm against the data file's format).

    Args:
        line: A single line of the data file, with or without its newline.

    Returns:
        A ``(phrase, pinyin, abbreviation, score)`` tuple, where ``pinyin`` is
        the syllables joined without a separator, ``abbreviation`` is the
        first letter of each syllable, and ``score`` is a float. Returns
        ``None`` for lines that are skipped: blank lines, lines with fewer
        than four fields, and lines whose third field is not ``'0'``.

    Raises:
        ValueError: If the score field is not a valid number.
    """
    # split() without an argument collapses runs of whitespace, so repeated
    # spaces never produce empty fields (which would break the s[0] below).
    parts = line.split()
    if len(parts) < 4 or parts[2] != '0':
        return None
    phrase, score, _flag, *syllables = parts
    pinyin = ''.join(syllables)
    abbreviation = ''.join(s[0] for s in syllables)
    return phrase, pinyin, abbreviation, float(score)


def insert_entries(conn, lines):
    """Insert every accepted line from *lines* into the pinyin_data table.

    Each inserted row is also printed, one row per line. The caller is
    responsible for committing and closing *conn*.

    Args:
        conn: An open sqlite3 connection whose database has a pinyin_data table.
        lines: An iterable of text lines (for example an open text file).

    Returns:
        The number of rows inserted.
    """
    cur = conn.cursor()
    inserted = 0
    for line in lines:
        entry = parse_entry(line)
        if entry is None:
            continue
        print(*entry)
        cur.execute(INSERT_SQL, entry)
        inserted += 1
    return inserted


def main():
    """Load the raw dictionary file into pinyin_data.sqlite3."""
    conn = sqlite3.connect('pinyin_data.sqlite3')
    try:
        with open('../google_pinyin_rawdict_utf16_65105_freq.txt', 'r', encoding='utf-16') as file:
            insert_entries(conn, file)
        # Commit only after the whole file was processed without error.
        conn.commit()
    finally:
        conn.close()


if __name__ == "__main__":
    main()
Binary file not shown.