Skip to content

Commit

Permalink
create and insert google pinyin with abbr of pinyin to sqlite3 db use…
Browse files Browse the repository at this point in the history
… Python 3.12.2
  • Loading branch information
dongyuwei committed Apr 5, 2024
1 parent db5db28 commit 0390a3c
Show file tree
Hide file tree
Showing 3 changed files with 72 additions and 0 deletions.
44 changes: 44 additions & 0 deletions dictionary/sqlite/create_pinyin_db_schema.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import sqlite3

# Path to the SQLite database file (relative to the current working directory).
db_file = 'pinyin_data.sqlite3'

# SQL statement to create the pinyin_data table. Column legend:
#   hz   - hanzi
#   py   - pinyin
#   abbr - abbreviation
#   freq - frequency
create_table_sql = """
CREATE TABLE IF NOT EXISTS pinyin_data (
id INTEGER PRIMARY KEY AUTOINCREMENT,
hz TEXT NOT NULL,
py TEXT NOT NULL,
abbr TEXT NOT NULL,
freq REAL NOT NULL
);
"""

# SQL statements to create indexes on the pinyin and abbreviation columns.
create_index_pinyin_sql = "CREATE INDEX IF NOT EXISTS idx_pinyin ON pinyin_data(py);"
create_index_abbreviation_sql = "CREATE INDEX IF NOT EXISTS idx_abbr ON pinyin_data(abbr);"

# Connect to the database. This will create the file if it doesn't exist.
conn = sqlite3.connect(db_file)
try:
    cur = conn.cursor()

    # Every statement uses IF NOT EXISTS, so re-running the script against an
    # existing database leaves the table and indexes in place.
    cur.execute(create_table_sql)
    cur.execute(create_index_pinyin_sql)
    cur.execute(create_index_abbreviation_sql)

    conn.commit()
finally:
    # Close the connection even when a statement above raises, so a failed
    # run does not leave the database handle open.
    conn.close()

# Only reached when every statement above succeeded.
print("Database, table, and indexes created successfully.")
28 changes: 28 additions & 0 deletions dictionary/sqlite/insert_pinyin_to_sqlite_db.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import sqlite3

def _parse_line(line):
    """Parse one line of the raw dictionary file into an insertable row.

    The line is split on single spaces into: phrase, score, a flag field,
    then one field per pinyin syllable.

    Returns:
        A ``(phrase, pinyin, abbreviation, freq)`` tuple, or ``None`` when
        the line should be skipped (fewer than three fields, or a third
        field other than ``'0'``).

    Raises:
        ValueError: if the score field is not a valid float.
    """
    parts = line.strip().split(' ')

    # Blank or truncated lines have no third field; skip them instead of
    # failing with IndexError on parts[2].
    if len(parts) < 3:
        return None

    # Only lines whose third field is '0' are imported; the '0' itself is
    # omitted from the stored data.
    if parts[2] != '0':
        return None

    # Consecutive spaces yield empty fields; drop them so that taking the
    # first letter of each syllable cannot raise IndexError.
    syllables = [p for p in parts[3:] if p]

    pinyin = ''.join(syllables)  # Syllables concatenated without separators
    abbreviation = ''.join(p[0] for p in syllables)  # First letter of each syllable
    return parts[0], pinyin, abbreviation, float(parts[1])


# Connect to your SQLite database
conn = sqlite3.connect('pinyin_data.sqlite3')
try:
    c = conn.cursor()

    # Open and read your data file
    with open('../google_pinyin_rawdict_utf16_65105_freq.txt', 'r', encoding='utf-16') as file:
        for line in file:
            row = _parse_line(line)
            if row is None:
                continue
            print(*row)
            # Execute the insert command
            c.execute('INSERT INTO pinyin_data (hz, py, abbr, freq) VALUES (?, ?, ?, ?)',
                      row)

    # Commit once after all rows are inserted; if anything above raises,
    # nothing is committed.
    conn.commit()
finally:
    # Close the connection even when reading or inserting fails.
    conn.close()
Binary file added dictionary/sqlite/pinyin_data.sqlite3
Binary file not shown.

0 comments on commit 0390a3c

Please sign in to comment.