感觉你的场景,用 SQLite 也挺好呀。
## 测试结果
*(同样表结构,每次开事务插入一行再提交)*
- 100W 行,104 秒,平均约 10000 TPS ,共 360MB
- 1000W 行,1450 秒,平均约 7000 TPS ,共 3.6GB
## 环境
- CPU:i5-8250U (六七年前的低压轻薄本)
- 内存:测试时占用 14 MB
- 系统:Deepin V20 (基于 Debian 10 )
- 软件:Python 3.9
- 固态:顺序:500 MB/s 读,300 MB/s 写;随机:20 MB/s 读,64 MB/s 写
## 代码
```python
# NOTE: the forum strips leading spaces, so the original post used full-width
# spaces for indentation; normal 4-space indentation is restored here.
#
# Benchmark script: bulk-insert synthetic PhishTank-like rows into SQLite,
# committing one small transaction per batch (ROWS_PER_INSERT rows), to
# measure sustained commit throughput (TPS).
import time
import random
import hashlib
import sqlite3

ROWS_PER_INSERT = 1      # rows per transaction; 1 => one commit per row
ROWS_TOTAL = 1000_0000   # total rows to insert (10,000,000)
DB_PATH = '/数据库存放路径/名称.db'  # placeholder — set to a real path before running

db = sqlite3.connect(DB_PATH)
# WAL journal + NORMAL sync: fewer fsyncs per commit, much higher TPS,
# at the cost of possibly losing the last few commits on power failure.
db.execute('PRAGMA journal_mode = WAL')
db.execute('PRAGMA synchronous = NORMAL')
db.execute('PRAGMA wal_autocheckpoint = 10000')

db.execute('''
    CREATE TABLE IF NOT EXISTS phishtank_database (
        id INTEGER PRIMARY KEY,
        phish_id INT,
        url TEXT,
        url_sha256 TEXT UNIQUE,
        phish_detail_url TEXT,
        submission_time DATETIME,
        verified TEXT,
        verification_time DATETIME,
        online TEXT,
        target TEXT,
        created_at DATETIME DEFAULT (datetime('now', 'localtime')),
        etag TEXT
    )
''')

# OR IGNORE: silently skip rows whose url_sha256 already exists (UNIQUE column).
insert_sql = '''
    INSERT OR IGNORE INTO phishtank_database
    (phish_id, url, url_sha256, phish_detail_url, submission_time, verified, verification_time, online, target, etag)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
'''

# Resume after the current max id so repeated runs keep generating fresh ids
# instead of producing duplicate url_sha256 values that would all be ignored.
first_id, = db.execute('SELECT IFNULL(MAX(id), 0) + 1 FROM phishtank_database').fetchone()

for next_id in range(first_id, first_id + ROWS_TOTAL, ROWS_PER_INSERT):
    rows = []
    now = time.strftime('%Y-%m-%d %H:%M:%S')
    # Build one batch; min() clamps the final (possibly short) batch.
    for phish_id in range(next_id, min(next_id + ROWS_PER_INSERT, first_id + ROWS_TOTAL)):
        url = f'https://free-{phish_id}.webadorsite.com/'
        # BUG FIX: in the original paste this f-string literal was broken
        # across two lines (a SyntaxError); rejoined into a single literal.
        detail_url = f'http://www.phishtank.com/phish_detail.php?phish_id={phish_id}'
        rows.append((
            phish_id, url, hashlib.sha256(url.encode('utf-8')).hexdigest(),
            detail_url, now, 'yes', now, 'yes', 'Other', random.randbytes(16).hex(),
        ))
    # `with db:` opens an implicit transaction and commits it on clean exit
    # (or rolls back on exception) — this is the per-batch commit being timed.
    with db:
        db.executemany(insert_sql, rows)
```