Moe’s Notes

fast list matching in python

I have been trying to match the strings between two large CSV files, on the order of 250,000 rows.

I found a blog post by van den Berg solving a similar problem.

Here are the modifications I made to adapt that approach to my problem:

def ngrams(string, n=3):
    """Split *string* into overlapping character n-grams.

    Commas, hyphens, periods, slashes, and a trailing " BD" token are
    stripped first so that cosmetic punctuation does not affect matching.
    Returns a list of n-character substrings (empty if the cleaned string
    is shorter than n).
    """
    cleaned = re.sub(r'[,-./]|\sBD', r'', string)
    return [cleaned[i:i + n] for i in range(len(cleaned) - n + 1)]

def awesome_cossim_top(A, B, ntop, lower_bound=0):
    """Sparse matrix product keeping only the best matches per row.

    Multiplies A (M x K) by B (K x N) and, for each row of the product,
    keeps at most ``ntop`` entries whose value is strictly greater than
    ``lower_bound`` (the largest ones, in descending order).

    NOTE(review): the original notes called the C routine from the
    ``sparse_dot_topn`` package here, but the call itself was lost; this
    is a pure scipy/numpy reconstruction of the same contract.

    Returns a scipy.sparse CSR matrix of shape (M, N).
    """
    # Force A and B to CSR format.
    # If they already are CSR, there is no overhead (no copy is made).
    A = A.tocsr()
    B = B.tocsr()
    M, _ = A.shape
    _, N = B.shape

    C = A.dot(B).tocsr()

    idx_dtype = np.int32
    indptr = np.zeros(M + 1, dtype=idx_dtype)
    col_chunks = []
    val_chunks = []
    for row in range(M):
        start, stop = C.indptr[row], C.indptr[row + 1]
        vals = C.data[start:stop]
        cols = C.indices[start:stop]
        # Drop entries at or below the threshold.
        keep = vals > lower_bound
        vals, cols = vals[keep], cols[keep]
        if vals.size > ntop:
            # argpartition selects the ntop largest in O(nnz) without
            # fully sorting the row.
            best = np.argpartition(-vals, ntop - 1)[:ntop]
            vals, cols = vals[best], cols[best]
        # Store the survivors in descending score order.
        order = np.argsort(-vals)
        vals, cols = vals[order], cols[order]
        indptr[row + 1] = indptr[row] + vals.size
        col_chunks.append(cols)
        val_chunks.append(vals)

    if col_chunks:
        indices = np.concatenate(col_chunks).astype(idx_dtype)
        data = np.concatenate(val_chunks)
    else:  # M == 0: empty result
        indices = np.zeros(0, dtype=idx_dtype)
        data = np.zeros(0, dtype=A.dtype)

    return csr_matrix((data, indices, indptr), shape=(M, N))

def get_matches_df(sparse_matrix, name_vector, top=100):
    """Flatten a sparse similarity matrix into a match DataFrame.

    sparse_matrix : scipy.sparse matrix of similarity scores; CSR is
        expected so that .nonzero() ordering lines up with .data.
    name_vector : sequence of names, indexed by both row and column ids.
    top : number of matches to report; a falsy value (0/None) means all.

    Returns a DataFrame with columns left_side, right_side, similairity
    (column name kept misspelled for compatibility with existing callers).
    """
    non_zeros = sparse_matrix.nonzero()
    sparserows = non_zeros[0]
    sparsecols = non_zeros[1]

    # Bug fix: the original overwrote nr_matches unconditionally, so
    # `top` was ignored; restore the else branch.
    if top:
        nr_matches = top
    else:
        nr_matches = sparsecols.size

    left_side = np.empty([nr_matches], dtype=object)
    right_side = np.empty([nr_matches], dtype=object)
    similairity = np.zeros(nr_matches)

    for index in range(0, nr_matches):
        left_side[index] = name_vector[sparserows[index]]
        right_side[index] = name_vector[sparsecols[index]]
        # Bug fix: the score source was missing; for a canonical CSR
        # matrix, .data is in the same order as .nonzero().
        similairity[index] = sparse_matrix.data[index]

    return pd.DataFrame({'left_side': left_side,
                         'right_side': right_side,
                         'similairity': similairity})

def get_csr_ntop_idx_data(csr_row, ntop):
    """Get list of (column index, score) of the n top matches in a row.

    csr_row : a single-row scipy.sparse CSR matrix (e.g. from getrow).
    ntop : maximum number of entries to return.

    Returns the pairs sorted by descending score, or None when the row
    has no stored entries.
    """
    nnz = csr_row.getnnz()
    if nnz == 0:
        return None
    elif nnz <= ntop:
        # Fewer stored entries than requested: take them all.
        result = zip(csr_row.indices, csr_row.data)
    else:
        # argpartition finds the ntop largest in O(nnz) without a full sort.
        arg_idx = np.argpartition(csr_row.data, -ntop)[-ntop:]
        result = zip(csr_row.indices[arg_idx], csr_row.data[arg_idx])

    return sorted(result, key=lambda x: -x[1])

def scipy_cossim_top(A, B, ntop, lower_bound=0):
    """Plain-scipy baseline for awesome_cossim_top.

    Multiplies A by B and reports, per row of the product, the ntop best
    (column index, score) pairs via get_csr_ntop_idx_data (None for
    empty rows). ``lower_bound`` is accepted for signature parity with
    awesome_cossim_top but is not applied here.
    """
    # Restored: the product assignment was truncated in the notes.
    C = A.dot(B)
    # Iterating a sparse matrix yields one row (1 x N matrix) at a time.
    return [get_csr_ntop_idx_data(row, ntop) for row in C]

# Load the "clean" reference names, one per line.
with open(DONATIONS_CLEAN, 'r') as f_clean:
    clean_lines = []
    for line in f_clean:
        # NOTE(review): the loop body was missing in the notes; collecting
        # the stripped lines is the presumable intent — confirm.
        clean_lines.append(line.strip())
    clean = pd.DataFrame(clean_lines).drop_duplicates()

# Load the "dirty" names to be matched, one per line.
with open(DONATIONS_DIRTY, 'r') as f_dirty:
    dirty_lines = []
    for line in f_dirty:
        # NOTE(review): the loop body was missing in the notes; collecting
        # the stripped lines is the presumable intent — confirm.
        dirty_lines.append(line.strip())

# Build TF-IDF vectors over character n-grams (the ngrams analyzer above).
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
# Fit on the dirty names, then project the clean names into the same space.
dirty_idf_matrix = vectorizer.fit_transform(dirty_lines)
clean_idf_matrix = vectorizer.transform(clean_lines)
# One best match (ntop=1) per dirty row; matches[i] is the row of scores
# against every clean name.
matches = awesome_cossim_top(dirty_idf_matrix, clean_idf_matrix.T, 1, 0)

# Append each dirty name alongside its best clean match.
with open(DONATIONS_MATCHED, 'a') as output:
    for dirty_idx, _ in enumerate(dirty_lines):
        # Best clean match = column with the highest score in this row.
        clean_idx = matches[dirty_idx].argmax()
        # NOTE(review): the write statement was missing from the notes;
        # emitting a dirty,clean CSV pair is the presumable intent — confirm.
        output.write('%s,%s\n' % (dirty_lines[dirty_idx], clean_lines[clean_idx]))