V2EX = way to explore
V2EX 是一个关于分享和探索的地方
现在注册
已注册用户请  登录
• 请不要在回答技术问题时复制粘贴 AI 生成的内容
777777
V2EX  ›  程序员

go 协程比 Python 多进程快好多!

  •  
  •   777777 · 2023-12-13 15:04:09 +08:00 · 931 次点击
    这是一个创建于 387 天前的主题,其中的信息可能已经有所发展或是发生改变。

    需求:有 2 个 list(alist 和 blist),alist 中的每个值都要与 blist 中的每个值做字符串相似度计算,两个 list 的数量级均为 20 万。下面是 Python 和 Go 的代码片段。Python:

    # Compute the similarity of two strings.
    def similar(a, b):
        """Return the token-set similarity of ``a`` and ``b`` scaled to 0..1.

        Args:
            a: First string, or ``None``.
            b: Second string, or ``None``.

        Returns:
            ``0`` when either argument is ``None``; otherwise the value of
            ``fuzz.token_set_ratio`` divided by 100.
        """
        if a is None or b is None:
            return 0
        # Lower-case BOTH operands. Previously only ``b`` was lowered, which
        # makes the comparison case-asymmetric whenever the ``fuzz``
        # implementation does not normalise case on its own.
        # The per-pair debug print was dropped on purpose: this function runs
        # once for every (record, name) pair, so writing a line to stdout per
        # call dominates the run time of the whole job.
        return fuzz.token_set_ratio(a.lower(), b.lower()) / 100
    
    
    def compute_similarity(args):
        """Score one ``(record_name, name)`` pair.

        Takes a single 2-tuple so it can be used directly as the callable of
        ``executor.map``. Returns ``(similarity, name)`` so the caller can tell
        which candidate name each score belongs to.
        """
        left, candidate = args
        score = similar(left, candidate)
        return score, candidate
    
    
    # Update database records.
    def update_database(cursor, name_mapping, csv_data, *, threshold=0.98, chunksize=1024):
        """Set ``factory`` on every fingerprint row whose name matches a CSV record.

        Args:
            cursor: DB-API cursor; only ``executemany`` is called on it.
            name_mapping: Mapping of ``uuid -> iterable of names``.
            csv_data: Iterable of dict-like rows; the ``"vendor"`` and
                ``"name"`` keys are read with ``row.get``.
            threshold: A candidate name matches when its similarity is strictly
                greater than this value (default 0.98, as before).
            chunksize: Number of comparisons sent to a worker process per
                batch. The default ``executor.map`` chunk size of 1 turns every
                single comparison into its own inter-process message.

        Returns:
            The number of ``(vendor, uuid)`` update rows handed to
            ``cursor.executemany``.
        """
        update_sql = "UPDATE tweb_fingerprint_test SET factory = %s WHERE uuid = %s"

        # Reverse mapping so a UUID list can be looked up directly by name.
        name_to_uuid = defaultdict(list)
        for uuid, names in name_mapping.items():
            for name in names:
                if name:  # skip None / empty names
                    name_to_uuid[name].append(uuid)

        # The candidate set does not change per row; snapshot it once.
        candidate_names = list(name_to_uuid)

        updates = []
        with ProcessPoolExecutor() as executor:
            for row in csv_data:
                vendor_name = row.get("vendor")
                record_name = row.get("name")
                print("record_name:", record_name)
                if (
                    vendor_name is None
                    or record_name is None
                    or vendor_name in ("未知", "None")
                ):
                    continue  # nothing usable in this row

                # Exact match first. Copy the list so the fuzzy branch below
                # can never mutate the lists stored in the mapping.
                uuids_to_update = list(name_to_uuid.get(record_name, ()))

                # No exact match: fall back to names above the threshold.
                if not uuids_to_update:
                    tasks = ((record_name, name) for name in candidate_names)
                    results = executor.map(
                        compute_similarity, tasks, chunksize=chunksize
                    )
                    for similarity, name in results:
                        if similarity > threshold:
                            # Extend with the UUIDs themselves. The previous
                            # code extended with a generator of *lists*, so
                            # each update row carried a list instead of a
                            # single uuid.
                            uuids_to_update.extend(name_to_uuid[name])

                updates.extend(
                    (vendor_name, uuid_to_update)
                    for uuid_to_update in uuids_to_update
                )

        # One batched statement for all collected rows.
        if updates:
            cursor.executemany(update_sql, updates)

        return len(updates)
    

    go:

    // Similar returns the Jaro-Winkler score of a and b as computed by
    // smetrics.JaroWinkler with the fixed parameters below.
    func Similar(a, b string) float64 {
    	const (
    		boostThreshold = 0.7 // third argument passed to smetrics.JaroWinkler
    		prefixSize     = 4   // fourth argument passed to smetrics.JaroWinkler
    	)
    	score := smetrics.JaroWinkler(a, b, boostThreshold, prefixSize)
    	return score
    }
    
    // UpdateDatabase sets the factory column of tweb_fingerprint for every vendor
    // whose name equals a CSV record's name or whose Similar score against it is
    // above SimilarityThreshold.
    //
    // One goroutine is started per record; each compares its record against all
    // vendors and sends the resulting updates over a channel. The calling
    // goroutine is the only one that talks to the database.
    //
    // It returns the number of UPDATE statements executed successfully and the
    // first error from Prepare or Exec, if any.
    func UpdateDatabase(db *sql.DB, vendors map[string]Vendor, records []CSVRecord) (int, error) {
    	fmt.Println("records", len(records))
    	fmt.Println("vendors", len(vendors))
    	stmt, err := db.Prepare("UPDATE tweb_fingerprint SET factory = ? WHERE uuid = ?")
    	if err != nil {
    		return 0, err
    	}
    	defer stmt.Close()

    	var wg sync.WaitGroup
    	updates := make(chan Updatedata, len(records))

    	// done is closed when this function returns. Without it, an early return
    	// on an Exec error would leave producers blocked on `updates <- ...`
    	// forever, because nobody reads from the channel any more.
    	done := make(chan struct{})
    	defer close(done)

    	for _, record := range records {
    		wg.Add(1)
    		go func(record CSVRecord) {
    			defer wg.Done()
    			// vendors is only read here, never written, so sharing the map
    			// between goroutines is safe.
    			for _, vendor := range vendors {
    				if record.Name == vendor.Name.String || Similar(record.Name, vendor.Name.String) > SimilarityThreshold {
    					select {
    					case updates <- Updatedata{
    						UUID:    vendor.UUID,
    						Factory: record.Vendor,
    					}:
    					case <-done:
    						// Consumer has given up; stop producing.
    						return
    					}
    				}
    			}
    		}(record)
    	}

    	// Close the channel once every producer has finished so the range loop
    	// below terminates.
    	go func() {
    		wg.Wait()
    		close(updates)
    	}()

    	count := 0
    	for update := range updates {
    		fmt.Println("update:", update)
    		if _, err := stmt.Exec(update.Factory, update.UUID); err != nil {
    			return count, err
    		}
    		count++
    	}

    	return count, nil
    }
    
    Baloneo
        1
    Baloneo  
       2023-12-13 16:27:22 +08:00
    快多少?
    777777
        2
    777777  
    OP
       2023-12-13 17:23:30 +08:00
    @Baloneo 至少 10 倍吧,python CPU 都打不满,没跑完我就重构成 go 了,go 十分钟就跑完了
    关于   ·   帮助文档   ·   博客   ·   API   ·   FAQ   ·   实用小工具   ·   2859 人在线   最高记录 6679   ·     Select Language
    创意工作者们的社区
    World is powered by solitude
    VERSION: 3.9.8.5 · 830ms · UTC 12:31 · PVG 20:31 · LAX 04:31 · JFK 07:31
    Developed with CodeLauncher
    ♥ Do have faith in what you're doing.