scikit-learn demo

An imaginary demo of how I might automate a job-matching exercise between two companies.
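
The script below assumes two CSV extracts with at least 'Job Code', 'Job Title' and 'Description' columns. A stand-in pair could be generated like this (made-up rows, purely to make the demo runnable):

import pandas as pd

# Hypothetical stand-in extracts - real ones would come from each company's HR system.
pd.DataFrame({'Job Code': ['A1', 'A2'],
              'Job Title': ['Data Analyst', 'HR Advisor'],
              'Description': ['Builds reports and dashboards', 'Supports recruitment']}
             ).to_csv('company_a_jobs.csv', index=False)

pd.DataFrame({'Job Code': ['B1', 'B2'],
              'Job Title': ['Analyst, Data', 'Human Resources Adviser'],
              'Description': ['Reporting and dashboards', 'Recruitment support']}
             ).to_csv('company_b_jobs.csv', index=False)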

Code
import time

import numpy as np
import pandas as pd
import regex as re
from ftfy import fix_text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors


t1 = time.time()

pd.set_option('display.max_columns', None)  # show all columns when using head()
pd.options.display.float_format = '{:.2f}'.format
np.set_printoptions(suppress=True)  # suppress scientific notation


obj_path = "C:\\Users\\johna\\OneDrive\\Documents\\Python\\py_projects\\proj-jobmatch\\"


# -----------------------Company Job Data--------------------------------

company_a = pd.read_csv(obj_path + 'company_a_jobs.csv', encoding='cp1252')
company_a['MATCH_COLUMN'] = company_a['Job Title'] + '_' + company_a['Description']

company_b = pd.read_csv(obj_path + 'company_b_jobs.csv', encoding='cp1252')
company_b['MATCH_COLUMN'] = company_b['Job Title'] + '_' + company_b['Description']
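# If 'Job Title' or 'Description' can be empty, the concatenations above yield NaN;
# a .fillna('') on both columns beforehand would keep those rows matchable.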


# -----------------------Text cleaning and n-gram tokeniser--------------------------------

def ngrams(string, n=3):
    """Clean a string and break it into overlapping character n-grams."""
    string = str(string)
    string = fix_text(string)  # repair mojibake / broken unicode
    string = string.encode("ascii", errors="ignore").decode()  # drop non-ASCII chars
    string = string.lower()
    chars_to_remove = [")", "(", ".", "|", "[", "]", "{", "}", "'"]
    rx = '[' + re.escape(''.join(chars_to_remove)) + ']'
    string = re.sub(rx, '', string)
    string = string.replace('&', 'and')
    string = string.replace(',', ' ')
    string = string.replace('-', ' ')
    string = string.replace('_', ' ')
    string = string.title()  # normalise case - capital at the start of each word
    string = re.sub(' +', ' ', string).strip()  # collapse repeated spaces into one
    string = ' ' + string + ' '  # pad so n-grams capture word boundaries
    string = re.sub(r'[,-./]', '', string)  # strip any separators that remain (e.g. '/')
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]
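
# For example, ngrams('Data Analyst') yields the character trigrams
# [' Da', 'Dat', 'ata', 'ta ', 'a A', ' An', 'Ana', 'nal', 'aly', 'lys', 'yst', 'st '].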

unique_a = company_a['MATCH_COLUMN'].unique().astype('U')
unique_b = company_b['MATCH_COLUMN'].unique().astype('U')

print('Vectorizing the data - this could take a few minutes for large datasets...')
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams, lowercase=False)

# -----------------------Nearest neighbors on the matching column--------------------------------
tfidf = vectorizer.fit_transform(unique_a)
nbrs = NearestNeighbors(n_neighbors=1, n_jobs=-1).fit(tfidf)
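# Note: NearestNeighbors defaults to Euclidean distance, but TfidfVectorizer
# L2-normalises its rows, and on unit vectors Euclidean distance is a monotonic
# function of cosine similarity (d^2 = 2 * (1 - cos)), so the nearest neighbour
# is the same one cosine similarity would pick.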

# matching query: find company A's nearest job for each company B string
def getNearestN(query):
    queryTFIDF_ = vectorizer.transform(query)
    distances, indices = nbrs.kneighbors(queryTFIDF_)
    return distances, indices

print('getting nearest n...')
distances, indices = getNearestN(unique_b)

unique_b = list(unique_b)  # convert back to a list for positional indexing
print('finding matches...')
matches = []
for i, j in enumerate(indices):
    temp = [round(distances[i][0], 2), unique_a[j][0], unique_b[i]]
    matches.append(temp)


print('Building data frame...')
matches = pd.DataFrame(matches)

matches = matches.rename({0: "Match confidence (lower is better)",
                          1: "MATCH_COLUMN_A",
                          2: "MATCH_COLUMN_B"}, axis=1)

print('Done')
t = time.time()-t1
print("COMPLETED IN:", t)

# bring company A's job data back in
matches_a = company_a.merge(matches, left_on='MATCH_COLUMN', right_on='MATCH_COLUMN_A', how='left', suffixes=('', '_A'))

# merge back with company B's job data
matches_all = pd.merge(matches_a, company_b, left_on='MATCH_COLUMN_B', right_on='MATCH_COLUMN', how='left', suffixes=('', '_B'))

matches_all = matches_all.iloc[:, :-2]  # drop the trailing duplicate match-key columns left over from the merge

result = matches_all

result['Match confidence (lower is better)'] = result['Match confidence (lower is better)'].fillna(0)

# for each company A job code, keep the row with the lowest match distance
best_match_score = result[result['Match confidence (lower is better)'] == result.groupby('Job Code', dropna=False)['Match confidence (lower is better)'].transform('min')].reset_index(drop=True)

best_match_score.to_csv(obj_path + 'best_match_score.csv', index=False)

Output
Vectorizing the data - this could take a few minutes for large datasets...
getting nearest n...
finding matches...
Building data frame...
Done
COMPLETED IN: 14.447463750839233
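
In a real run I'd eyeball the weakest matches before trusting the output. A minimal follow-up sketch (the 1.0 cut-off is an illustrative guess, not a tuned threshold):

import pandas as pd

best = pd.read_csv('best_match_score.csv')  # the file written above
# Euclidean distances between L2-normalised tf-idf vectors lie in [0, sqrt(2)],
# so values near the top of that range are effectively "no match".
weak = best[best['Match confidence (lower is better)'] > 1.0]
print(weak[['MATCH_COLUMN_A', 'MATCH_COLUMN_B', 'Match confidence (lower is better)']])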