An imaginary demo of how I might automate a job-matching exercise between two companies, pairing each of Company B's postings with its closest match in Company A's list using TF-IDF over character n-grams and nearest-neighbour search.
Code
import os
import sys
import time
import difflib
# from thefuzz import fuzz, process
# pyright: reportMissingImports=false
import numpy as np
import pandas as pd
import regex as re
from ftfy import fix_text
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import sparse_dot_topn.sparse_dot_topn as ct  # imported but not used below

t1 = time.time()

pd.set_option('display.max_columns', None)   # show all columns when using head()
pd.options.display.float_format = '{:.2f}'.format
np.set_printoptions(suppress=True)           # suppress scientific notation

obj_path = "C:\\Users\\johna\\OneDrive\\Documents\\Python\\py_projects\\proj-jobmatch\\"
sys.path.insert(0, obj_path)

# from call_logger import get_logger
# global LOGGER
# LOGGER = get_logger(os.path.abspath(__name__))
# LOGGER.info(f"Starting the Script : {__name__}")

# ----------------------- Company job data -----------------------
company_a = pd.read_csv(obj_path + 'company_a_jobs.csv', encoding='cp1252')
company_a['MATCH_COLUMN'] = company_a['Job Title'] + '_' + company_a['Description']
company_b = pd.read_csv(obj_path + 'company_b_jobs.csv', encoding='cp1252')
company_b['MATCH_COLUMN'] = company_b['Job Title'] + '_' + company_b['Description']

# ----------------------- n-gram tokeniser used by the TF-IDF vectorizer -----------------------
def ngrams(string, n=3):
    string = str(string)
    string = fix_text(string)                                  # repair broken unicode
    string = string.encode("ascii", errors="ignore").decode()  # remove non-ASCII chars
    string = string.lower()
    chars_to_remove = [")", "(", ".", "|", "[", "]", "{", "}", "'"]
    rx = '[' + re.escape(''.join(chars_to_remove)) + ']'
    string = re.sub(rx, '', string)
    string = string.replace('&', 'and')
    string = string.replace(',', ' ')
    string = string.replace('-', ' ')
    string = string.replace('_', ' ')
    string = string.title()                     # normalise case - capital at start of each word
    string = re.sub(' +', ' ', string).strip()  # collapse runs of spaces into one
    string = ' ' + string + ' '                 # pad names for n-grams
    string = re.sub(r'[,-./]|\sBD', r'', string)
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]

unique_a = company_a['MATCH_COLUMN'].unique().astype('U')
unique_b = company_b['MATCH_COLUMN'].unique().astype('U')

print('Vectorizing the data - this could take a few minutes for large datasets...')
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams, lowercase=False)

# ----- nearest neighbours on the matching column -----
tfidf = vectorizer.fit_transform(unique_a)
nbrs = NearestNeighbors(n_neighbors=1, n_jobs=-1).fit(tfidf)

# matching query:
def getNearestN(query):
    queryTFIDF_ = vectorizer.transform(query)
    distances, indices = nbrs.kneighbors(queryTFIDF_)
    return distances, indices

print('getting nearest n...')
distances, indices = getNearestN(unique_b)
unique_b = list(unique_b)  # convert back to a list for positional indexing

print('finding matches...')
matches = []
for i, j in enumerate(indices):
    temp = [round(distances[i][0], 2), unique_a[j][0], unique_b[i]]
    matches.append(temp)

print('Building data frame...')
matches = pd.DataFrame(matches)
matches = matches.rename({0: "Match confidence (lower is better)",
                          1: "MATCH_COLUMN_A",
                          2: "MATCH_COLUMN_B"}, axis=1)
print('Done')

t = time.time() - t1
print("COMPLETED IN:", t)

# bring Company A's job data back in
matches_a = company_a.merge(matches, left_on='MATCH_COLUMN', right_on='MATCH_COLUMN_A',
                            how='left', suffixes=('', '_A'))
# merge back with Company B's job data
matches_all = pd.merge(matches_a, company_b, left_on='MATCH_COLUMN_B', right_on='MATCH_COLUMN',
                       how='left', suffixes=('', '_B'))
matches_all = matches_all.iloc[:, :-2]

result = matches_all
result['Match confidence (lower is better)'] = result['Match confidence (lower is better)'].replace(np.nan, 0)

# keep only the best-scoring match per job code
best_match_score = result[result['Match confidence (lower is better)'] ==
                          result.groupby('Job Code', dropna=False)['Match confidence (lower is better)']
                                .transform('min')].reset_index(drop=True)
best_match_score.to_csv(obj_path + 'best_match_score.csv', index=False)
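To make the tokenisation concrete, here is a quick spot check of what the ngrams helper feeds the vectorizer. The input string is made up, and the output comment shows the general shape rather than guaranteed exact values:

# Hypothetical spot check of the ngrams tokeniser defined above.
# Each match string is normalised, padded, then sliced into 3-character shingles.
print(ngrams("Senior Data Analyst"))
# -> [' Se', 'Sen', 'eni', 'nio', 'ior', 'or ', 'r D', ' Da', 'Dat', 'ata', ...]

TF-IDF over these shingles is what makes the matcher tolerant of small spelling and word-order differences between the two companies' titles and descriptions.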
Output
Vectorizing the data - this could take a few minutes for large datasets...
getting nearest n...
finding matches...
Building data frame...
Done
COMPLETED IN: 14.447463750839233
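A note on reading the score: TfidfVectorizer L2-normalises each row by default, and NearestNeighbors reports Euclidean distance by default, so "Match confidence (lower is better)" is the Euclidean distance between two unit vectors. For unit vectors this relates to cosine similarity by d = sqrt(2 * (1 - cos)): 0 means an identical match string, and the value grows toward sqrt(2) ≈ 1.41 as the n-gram overlap disappears. A minimal sketch with made-up vectors verifying the identity:

import numpy as np

# Two made-up unit vectors standing in for L2-normalised TF-IDF rows.
a = np.array([1.0, 0.0])
b = np.array([0.6, 0.8])

cos_sim = a @ b
# For unit vectors: ||a - b||^2 = 2 - 2 * (a . b)
assert np.isclose(np.linalg.norm(a - b), np.sqrt(2 * (1 - cos_sim)))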
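The script imports sparse_dot_topn but never calls it. For much larger datasets, the 0.x releases of that library expose awesome_cossim_topn, which computes only the top-n cosine matches between two sparse matrices instead of a full pairwise comparison. A hedged sketch of how it could slot in after the vectorising step; the lower_bound threshold is chosen purely for illustration, and newer 1.x releases of the library renamed this entry point:

# Hypothetical alternative to NearestNeighbors using sparse_dot_topn's 0.x API.
from sparse_dot_topn import awesome_cossim_topn

tfidf_b = vectorizer.transform(unique_b)  # vectorise Company B with the fitted vocabulary
# For each Company B row, keep its single best Company A match with
# cosine similarity of at least 0.1 (threshold chosen for illustration).
top_matches = awesome_cossim_topn(tfidf_b, tfidf.transpose().tocsr(), ntop=1, lower_bound=0.1)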