Moe’s Notes

fast list matching in python

I have been trying to match the strings between two large CSV files, on the order of 250,000 rows.

I found a blog post by van den Berg solving a similiar problem.

Here are the modifications I made to suit the approch to the problem I had:

def ngrams(string, n=3):
    string = re.sub(r'[,-./]|\sBD',r'', string)
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]


def awesome_cossim_top(A, B, ntop, lower_bound=0):
    # force A and B as a CSR matrix.
    # If they have already been CSR, there is no overhead
    A = A.tocsr()
    B = B.tocsr()
    M, _ = A.shape
    _, N = B.shape
 
    idx_dtype = np.int32
 
    nnz_max = M*ntop
 
    indptr = np.zeros(M+1, dtype=idx_dtype)
    indices = np.zeros(nnz_max, dtype=idx_dtype)
    data = np.zeros(nnz_max, dtype=A.dtype)

    ct.sparse_dot_topn(
        M, N, np.asarray(A.indptr, dtype=idx_dtype),
        np.asarray(A.indices, dtype=idx_dtype),
        A.data,
        np.asarray(B.indptr, dtype=idx_dtype),
        np.asarray(B.indices, dtype=idx_dtype),
        B.data,
        ntop,
        lower_bound,
        indptr, indices, data)

    return csr_matrix((data,indices,indptr),shape=(M,N))


def get_matches_df(sparse_matrix, name_vector, top=100):
    non_zeros = sparse_matrix.nonzero()
    
    sparserows = non_zeros[0]
    sparsecols = non_zeros[1]
    
    if top:
        nr_matches = top
    else:
        nr_matches = sparsecols.size
    
    left_side = np.empty([nr_matches], dtype=object)
    right_side = np.empty([nr_matches], dtype=object)
    similairity = np.zeros(nr_matches)
    
    for index in range(0, nr_matches):
        left_side[index] = name_vector[sparserows[index]]
        right_side[index] = name_vector[sparsecols[index]]
        similairity[index] = sparse_matrix.data[index]
    
    return pd.DataFrame({'left_side': left_side,
                          'right_side': right_side,
                           'similairity': similairity})
        

def get_csr_ntop_idx_data(csr_row, ntop):
    """
    Get list (row index, score) of the n top matches
    """
    nnz = csr_row.getnnz()
    if nnz == 0:
        return None
    elif nnz <= ntop:
        result = zip(gt_csr_row.indices, csr_row.data)
    else:
        arg_idx = np.argpartition(csr_row.data, -ntop)[-ntop:]
        result = zip(csr_row.indices[arg_idx], csr_row.data[arg_idx])

    return sorted(result, key=lambda x: -x[1])

def scipy_cossim_top(A, B, ntop, lower_bound=0):
    C = A.dot(B)
    return [get_csr_ntop_idx_data(row, ntop) for row in C]

with open(DONATIONS_CLEAN, 'r') as f_clean:
    clean_lines = []
    for line in f_clean:
        clean_lines.append(line)
    clean = pd.DataFrame(clean_lines).drop_duplicates()

with open(DONATIONS_DIRTY, 'r') as f_dirty:
    dirty_lines = []
    for line in f_dirty:
        dirty_lines.append(line)

vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
dirty_idf_matrix = vectorizer.fit_transform(dirty_lines)
clean_idf_matrix = vectorizer.transform(clean_lines)
matches = awesome_cossim_top(dirty_idf_matrix, clean_idf_matrix.T, 1, 0)

with open(DONATIONS_MATCHED, 'a') as output:
    for dirty_idx, _ in enumerate(dirty_lines):
        clean_idx = matches[dirty_idx].argmax()
        output.write(clean_lines[clean_idx])