This stage attempts to cleanse the data, match to other sources of data and tag with categories (such as Individuals).
import pandas as pd
from thefuzz.process import extractOne, extractBests
import pipeline_utils.db as database
from config import RAW, WORKING, FUNDED_ORGS_LIST
Make direct matches to the company data
First, lets load the company data database.
db = database.connect(read_only=True)
Then we'll load our raw longlist into a temporary table.
db.sql(f'''CREATE TEMP TABLE tFundedOrgs AS SELECT DISTINCT organisation FROM read_csv('{FUNDED_ORGS_LIST}');''')
db.sql('''SELECT COUNT(*) AS Count FROM tFundedOrgs''')
┌───────┐ │ Count │ │ int64 │ ├───────┤ │ 511 │ └───────┘
We'll create a table of direct matches
db.sql('''
CREATE TEMP TABLE tDirect as SELECT r.*,
CompanyName as registered_name,
CompanyNumber as company_number,
"URI" as uri,
"RegAddress.PostTown" as post_town,
"RegAddress.PostCode" as postcode,
CompanyCategory as company_category,
CompanyStatus as company_status,
[x for x in [
"SICCode.SicText_1",
"SICCode.SicText_2",
"SICCode.SicText_3",
"SICCode.SicText_4"
] if x is not NULL] as sic_code,
IncorporationDate as incorporation_date,
DissolutionDate as dissolution_date,
"Accounts.AccountCategory" as accounts_category
-- , c.*
FROM tFundedOrgs r LEFT JOIN CompanyData c
ON upper(r.organisation) == c.CompanyName;
''')
ignored_companies = [
'Paul Miller'
]
direct_matches = db.sql(
'SELECT * from tDirect WHERE company_number IS NOT NULL'
).df().query('~organisation.isin(@ignored_companies)')
db.close()
direct_matches.sort_values(by='organisation').loc[: , ['organisation', 'company_number']].to_csv(WORKING / '2-company-match-direct.csv', index=False)
Fix typos in longlist
Having matched the details, let's see if we can fuzzy match missing items in the longlist.
First, let's get a list of organisations that have been matched to Company House data.
matched_organisations = direct_matches.organisation.unique().tolist()
Then load the raw longlist
raw = pd.read_csv(FUNDED_ORGS_LIST)
corrections = pd.concat(
[
raw,
raw.organisation.map(
lambda x: extractOne(x, matched_organisations, score_cutoff=90)
).apply(
pd.Series, index=['match', 'score']
)
], axis=1
).query(
'score.notna() and score < 100'
).loc[: ,['organisation', 'match']].set_index('organisation')
corrections
match | |
---|---|
organisation | |
Monkfish Productions CIC | Monkfish Productions CIO |
Moving Parts Arts | Moving Parts Arts CIO |
tiny dragon Productins | tiny dragon Productions |
corrections.to_csv(WORKING / '2-company-corrections.csv')
Fuzzy match company data
drop_list = pd.concat([direct_matches, corrections.reset_index()]).organisation
db = database.connect(read_only=True)
companies = db.query('''
SELECT DISTINCT CompanyName, CompanyNumber FROM CompanyData
WHERE CompanyStatus == 'Active'
AND "RegAddress.PostCode" SIMILAR TO '(NE|DH|SR).*'
ORDER BY CompanyName;
''').df()
db.close()
candidates = pd.read_csv(FUNDED_ORGS_LIST, usecols=[0]).organisation
def matcher(c, choices):
return extractBests(c, choices, score_cutoff=80)
candidate_list = candidates[~candidates.isin(drop_list)].unique().tolist()
matches = companies.CompanyName.str.replace(r'\W+', ' ', regex=True).apply(matcher, choices=candidate_list).rename('Matches')
res = pd.concat([companies, matches], axis=1).explode('Matches').dropna()
res['match'], res['score'] = zip(*res.Matches)
fuzzy_matches = res.loc[res.score > 90, ['match', 'CompanyName', 'CompanyNumber', 'score']]
fuzzy_matches['type'] = 'fuzzy'
fuzzy_matches.set_index('match').to_csv(WORKING / '2-company-match-fuzzy.csv')
Direct match charities
db = database.connect(read_only=True)
db.query(f'''
CREATE OR REPLACE TEMP TABLE tShortlist AS SELECT organisation FROM '{ FUNDED_ORGS_LIST }';
''')
charities = db.query('''
SELECT DISTINCT
s.*,
c.charity_name,
c.registered_charity_number,
c.charity_company_registration_number
FROM Charities c
JOIN tShortlist s
ON upper(s.organisation) == upper(c.charity_name)
ORDER BY charity_name
''').df(
)
charities.to_csv(WORKING / '2-charity-match-direct.csv')
db.close()
SIC Codes
db = database.connect(read_only=True)
Get all matched company numbers and store in a temp table.
db.query(
f'''
CREATE OR REPLACE TEMP TABLE tCompanyNumbers AS
SELECT *
FROM
(
SELECT company_number FROM read_csv('{ WORKING / '2-company-match-direct.csv' }')
UNION ALL
SELECT CompanyNumber AS company_number FROM read_csv('{ WORKING / '2-company-match-fuzzy.csv' }')
UNION ALL
SELECT charity_company_registration_number AS company_number FROM read_csv('{ WORKING / '2-charity-match-direct.csv' }')
)
'''
)
all_sic_codes = db.query(
'''
SELECT
[x for x in [
"SICCode.SicText_1",
"SICCode.SicText_2",
"SICCode.SicText_3",
"SICCode.SicText_4"
] if x is not NULL] as sic_code
FROM CompanyData c
JOIN tCompanyNumbers n
ON c.CompanyNumber == n.company_number
'''
)
excluded_sic_codes = [
'None Supplied',
'47610 - Retail sale of books in specialised stores',
'47990 - Other retail sale not in stores, stalls or markets',
'56302 - Public houses and bars',
'70210 - Public relations and communications activities',
'82990 - Other business support service activities n.e.c.',
'84110 - General public administration activities',
'85590 - Other education n.e.c.',
'85600 - Educational support services',
'87900 - Other residential care activities n.e.c.',
'88100 - Social work activities without accommodation for the elderly and disabled',
'88990 - Other social work activities without accommodation n.e.c.',
'93120 - Activities of sport clubs',
'93290 - Other amusement and recreation activities n.e.c.',
'94110 - Activities of business and employers membership organizations',
'94990 - Activities of other membership organizations n.e.c.',
'96090 - Other service activities n.e.c.',
]
pd.Series(
all_sic_codes.df().sic_code.explode().unique(),
name='sic_code'
).sort_values().pipe(
lambda s: s.loc[~s.isin(excluded_sic_codes)]
).to_csv(
WORKING / '2-sic-codes.csv', index=False
)