This stage attempts to cleanse the data, match to other sources of data and tag with categories (such as Individuals).

import pandas as pd
from thefuzz.process import extractOne, extractBests

import pipeline_utils.db as database

from config import RAW, WORKING, FUNDED_ORGS_LIST

Make direct matches to the company data

First, lets load the company data database.

db = database.connect(read_only=True)

Then we'll load our raw longlist into a temporary table.

db.sql(f'''CREATE TEMP TABLE tFundedOrgs AS SELECT DISTINCT organisation FROM read_csv('{FUNDED_ORGS_LIST}');''')
db.sql('''SELECT COUNT(*) AS Count FROM tFundedOrgs''')

┌───────┐ │ Count │ │ int64 │ ├───────┤ │ 511 │ └───────┘

We'll create a table of direct matches

db.sql('''
       CREATE TEMP TABLE tDirect as SELECT r.*,
              CompanyName as registered_name,
              CompanyNumber as company_number,
              "URI" as uri,
              "RegAddress.PostTown" as post_town,
              "RegAddress.PostCode" as postcode,
              CompanyCategory as company_category,
              CompanyStatus as company_status,
              [x for x in [
                     "SICCode.SicText_1",
                     "SICCode.SicText_2",
                     "SICCode.SicText_3",
                     "SICCode.SicText_4"
              ] if x is not NULL] as sic_code,
              IncorporationDate as incorporation_date,
              DissolutionDate as dissolution_date,
              "Accounts.AccountCategory" as accounts_category
              -- , c.*
                        
       FROM tFundedOrgs r LEFT JOIN CompanyData c
       ON upper(r.organisation) == c.CompanyName;
''')
ignored_companies = [
    'Paul Miller'
]
direct_matches = db.sql(
    'SELECT * from tDirect WHERE company_number IS NOT NULL'
).df().query('~organisation.isin(@ignored_companies)')
db.close()
direct_matches.sort_values(by='organisation').loc[: , ['organisation', 'company_number']].to_csv(WORKING / '2-company-match-direct.csv', index=False)

Fix typos in longlist

Having matched the details, let's see if we can fuzzy match missing items in the longlist.

First, let's get a list of organisations that have been matched to Company House data.

matched_organisations = direct_matches.organisation.unique().tolist()

Then load the raw longlist

raw = pd.read_csv(FUNDED_ORGS_LIST)
corrections = pd.concat(
    [
        raw,
        raw.organisation.map(
            lambda x: extractOne(x, matched_organisations, score_cutoff=90)
        ).apply(
            pd.Series, index=['match', 'score']
        )
    ], axis=1
).query(
    'score.notna() and score < 100'
).loc[: ,['organisation', 'match']].set_index('organisation')
corrections

match
organisation
Monkfish Productions CIC Monkfish Productions CIO
Moving Parts Arts Moving Parts Arts CIO
tiny dragon Productins tiny dragon Productions
corrections.to_csv(WORKING / '2-company-corrections.csv')

Fuzzy match company data

drop_list = pd.concat([direct_matches, corrections.reset_index()]).organisation
db = database.connect(read_only=True)
companies = db.query('''
                        SELECT DISTINCT CompanyName, CompanyNumber FROM CompanyData
                        WHERE CompanyStatus == 'Active'
                        AND "RegAddress.PostCode" SIMILAR TO '(NE|DH|SR).*'
                        ORDER BY CompanyName;
''').df()
db.close()
candidates = pd.read_csv(FUNDED_ORGS_LIST, usecols=[0]).organisation
def matcher(c, choices):
    return extractBests(c, choices, score_cutoff=80)
candidate_list = candidates[~candidates.isin(drop_list)].unique().tolist()
matches = companies.CompanyName.str.replace(r'\W+', ' ', regex=True).apply(matcher, choices=candidate_list).rename('Matches')
res = pd.concat([companies, matches], axis=1).explode('Matches').dropna()
res['match'], res['score'] = zip(*res.Matches)

fuzzy_matches = res.loc[res.score > 90, ['match', 'CompanyName', 'CompanyNumber', 'score']]
fuzzy_matches['type'] = 'fuzzy'
fuzzy_matches.set_index('match').to_csv(WORKING / '2-company-match-fuzzy.csv')

Direct match charities

db = database.connect(read_only=True)
db.query(f'''
         CREATE OR REPLACE TEMP TABLE tShortlist AS SELECT organisation FROM '{ FUNDED_ORGS_LIST }';
         ''')
charities = db.query('''
         SELECT DISTINCT
            s.*,
            c.charity_name,
            c.registered_charity_number,
            c.charity_company_registration_number
         FROM Charities c
         JOIN tShortlist s
         ON upper(s.organisation) == upper(c.charity_name)
         ORDER BY charity_name
         ''').df(
         )
charities.to_csv(WORKING / '2-charity-match-direct.csv')
db.close()

SIC Codes

db = database.connect(read_only=True)

Get all matched company numbers and store in a temp table.

db.query(
    f'''
    CREATE OR REPLACE TEMP TABLE tCompanyNumbers AS
    SELECT *
    FROM
    (
        SELECT company_number FROM read_csv('{ WORKING / '2-company-match-direct.csv' }')
        UNION ALL
        SELECT CompanyNumber AS company_number FROM read_csv('{ WORKING / '2-company-match-fuzzy.csv' }')
        UNION ALL
        SELECT charity_company_registration_number AS company_number FROM read_csv('{ WORKING / '2-charity-match-direct.csv' }')
    )
    '''
)
all_sic_codes = db.query(
    '''
    SELECT
        [x for x in [
            "SICCode.SicText_1",
            "SICCode.SicText_2",
            "SICCode.SicText_3",
            "SICCode.SicText_4"
        ] if x is not NULL] as sic_code
    FROM CompanyData c
    JOIN tCompanyNumbers n
    ON c.CompanyNumber == n.company_number
    '''
)
excluded_sic_codes = [
    'None Supplied',
    '47610 - Retail sale of books in specialised stores',
    '47990 - Other retail sale not in stores, stalls or markets',
    '56302 - Public houses and bars',
    '70210 - Public relations and communications activities',
    '82990 - Other business support service activities n.e.c.',
    '84110 - General public administration activities',
    '85590 - Other education n.e.c.',
    '85600 - Educational support services',
    '87900 - Other residential care activities n.e.c.',
    '88100 - Social work activities without accommodation for the elderly and disabled',
    '88990 - Other social work activities without accommodation n.e.c.',
    '93120 - Activities of sport clubs',
    '93290 - Other amusement and recreation activities n.e.c.',
    '94110 - Activities of business and employers membership organizations',
    '94990 - Activities of other membership organizations n.e.c.',
    '96090 - Other service activities n.e.c.',
]
pd.Series(
    all_sic_codes.df().sic_code.explode().unique(),
    name='sic_code'
).sort_values().pipe(
    lambda s: s.loc[~s.isin(excluded_sic_codes)]
).to_csv(
    WORKING / '2-sic-codes.csv', index=False
)

Identify possible individuals