This stage merges the raw longlist with the candidate data from the reference sources to create the new list.
It round-trips the data from the existing landscape list to ensure that any manual overrides are included.
import petl as etl
from pipeline_utils.reference.geo import la_code_lookup
from pipeline_utils.reference.onspd import normalise_postcode, postcode_lookup
from pipeline_utils.filesystem.paths import RAW_DATA, DATA
import pipeline_utils.db as database
from config import WORKING
Load the spelling corrections we have inferred from the matching stage.
corrections = etl.fromcsv(WORKING / '2-company-corrections.csv').lookupone('organisation', 'match')
Load the untagged longlist from the raw directory and perform the following operations:
- Convert numeric data to numbers
- Correct the spellings of the organisational data
- Augment with local authority data
raw = etl.fromcsv(
WORKING / 'funded-organisations.csv'
).convertnumbers(
).convert(
'organisation', lambda x: corrections.get(x, x)
).convert(
'Local authority', la_code_lookup
).unpackdict(
'Local authority'
).rename({
'LAD24CD': 'funding_geo_code',
'LAD24NM': 'funding_geo_name',
}).cache()
raw
organisation | Source | Number | funding_geo_code | funding_geo_name |
---|---|---|---|---|
4M Puppets | Project Grant | 1 | E08000021 | Newcastle upon Tyne |
Abdulrahman Abu - Zayd | Project Grant | 1 | E08000037 | Gateshead |
Action for Children | Project Grant | 1 | E06000057 | Northumberland |
Adam Phillips | Project Grant | 3 | E08000037 | Gateshead |
Adam Shield | Project Grant | 1 | E08000021 | Newcastle upon Tyne |
...
Get the list of sources in the longlist. We'll use this to update the values in the columns later on.
sources = tuple(raw.cut('Source').distinct().values('Source'))
sources
('IPSO', 'NPO', 'Project Grant')
Recast the longlist to convert the Source column into a column per entry, and convert any non-None values into True
wide_table = raw.recast(
variablefield="Source",
valuefield="Number"
).convert(
sources,
lambda x: True if x is not None else None
)
wide_table
organisation | funding_geo_code | funding_geo_name | IPSO | NPO | Project Grant |
---|---|---|---|---|---|
4M Puppets | E08000021 | Newcastle upon Tyne | None | None | True |
Abdulrahman Abu - Zayd | E08000037 | Gateshead | None | None | True |
Action for Children | E06000057 | Northumberland | None | None | True |
Adam Phillips | E08000037 | Gateshead | None | None | True |
Adam Shield | E08000021 | Newcastle upon Tyne | None | None | True |
...
At this point we will also add in new data from the result of stage 2.
location
Manually set locationscompanies
Company data from Companies house (direct and fuzzy matched)
location = etl.fromcsv(RAW_DATA / 'landscape-locations.csv').lookupone('organisation', ['latitude', 'longitude'])
Direct and fuzzy data is loaded from the database.
db = database.connect(read_only=True)
db.query(f'''
CREATE OR REPLACE TEMP TABLE tCompanies AS
SELECT match as organisation, CompanyNumber as company_number, type, score
FROM read_csv('{ WORKING / '2-company-match-fuzzy.csv' }')
UNION ALL
SELECT organisation, charity_company_registration_number AS company_number, 'charity' AS type, 100 AS score
FROM read_csv('{ WORKING / '2-charity-match-direct.csv' }')
UNION ALL
SELECT *, 'direct' AS type, 100 as score
FROM read_csv('{ WORKING / '2-company-match-direct.csv' }');
CREATE OR REPLACE TEMP TABLE tCharities AS
SELECT *
FROM read_csv('{ WORKING / '2-charity-match-direct.csv' }');
CREATE OR REPLACE TEMP TABLE tSicCodes AS
SELECT *
FROM read_csv('{ WORKING / '2-sic-codes.csv' }');
CREATE OR REPLACE TEMP TABLE tPostcodes AS
SELECT pcds AS postcode, lat, long
FROM read_csv('{ DATA / 'reference/onspd_extract.csv' }')
WHERE oslaua == 'E08000021';
CREATE OR REPLACE TEMP TABLE tCompanyExtract AS
SELECT
CompanyName as registered_name,
CompanyNumber as company_number,
"URI" as uri,
"RegAddress.PostTown" as post_town,
"RegAddress.PostCode" as postcode,
CompanyCategory as company_category,
CompanyStatus as company_status,
[x for x in [
"SICCode.SicText_1",
"SICCode.SicText_2",
"SICCode.SicText_3",
"SICCode.SicText_4"
] if x is not NULL] as sic_code,
IncorporationDate as incorporation_date,
DissolutionDate as dissolution_date,
"Accounts.AccountCategory" as accounts_category,
lat AS latitude, long as longitude,
FROM CompanyData c
JOIN tPostcodes p
ON c."RegAddress.PostCode" == p.postcode;
;
''')
db.query('''
SELECT * from tCompanies;
''')
┌─────────────────────────────────────┬────────────────┬─────────┬───────┐ │ organisation │ company_number │ type │ score │ │ varchar │ varchar │ varchar │ int64 │ ├─────────────────────────────────────┼────────────────┼─────────┼───────┤ │ Allenheads Contemporary Arts │ 06764121 │ fuzzy │ 95 │ │ Amber Film & Photography Collective │ 07218282 │ fuzzy │ 95 │ │ a-n The Artists Information Company │ 01626331 │ fuzzy │ 92 │ │ Bloodaxe Books Ltd │ 15402049 │ fuzzy │ 95 │ │ Changing Relations │ 08715299 │ fuzzy │ 95 │ │ Cloud Nine Theatre Productions │ 04095014 │ fuzzy │ 95 │ │ Eliot Smith Company │ 09442391 │ fuzzy │ 95 │ │ Equal Arts │ 01992359 │ fuzzy │ 95 │ │ Generator North East │ 03670235 │ fuzzy │ 95 │ │ Hexham Book Festival │ 06630555 │ fuzzy │ 95 │ │ · │ · │ · │ · │ │ · │ · │ · │ · │ │ · │ · │ · │ · │ │ The Witham Hall Ltd │ 06959661 │ direct │ 100 │ │ Theatre Space North East CIC │ 10059355 │ direct │ 100 │ │ Tyneside Cinema │ 01113101 │ direct │ 100 │ │ Unfolding Theatre │ 06764666 │ direct │ 100 │ │ Unlock Music CIC │ 13277858 │ direct │ 100 │ │ Vane Contemporary Art Limited │ 04313545 │ direct │ 100 │ │ Woodhorn Charitable Trust │ 06893854 │ direct │ 100 │ │ Workie Ticket Theatre CIC │ 11780194 │ direct │ 100 │ │ YMCA North Tyneside │ 02703063 │ direct │ 100 │ │ tiny dragon Productions │ 13816200 │ direct │ 100 │ ├─────────────────────────────────────┴────────────────┴─────────┴───────┤ │ 125 rows (20 shown) 4 columns │ └────────────────────────────────────────────────────────────────────────┘
companies = etl.fromdataframe(
db.query('''
SELECT m.organisation as organisation,
c.CompanyName as registered_name,
c.CompanyNumber as company_number,
m.type,
m.score,
"URI" as uri,
"RegAddress.PostTown" as post_town,
"RegAddress.PostCode" as postcode,
CompanyCategory as company_category,
CompanyStatus as company_status,
[x for x in [
"SICCode.SicText_1",
"SICCode.SicText_2",
"SICCode.SicText_3",
"SICCode.SicText_4"
] if x is not NULL] as sic_code,
IncorporationDate as incorporation_date,
DissolutionDate as dissolution_date,
"Accounts.AccountCategory" as accounts_category
FROM tCompanies m
JOIN CompanyData c
ON m.company_number == c.CompanyNumber;
''').df())
company_by_sic = etl.fromdataframe(
db.query(f'''
SELECT DISTINCT c.*
FROM (
SELECT e.*
FROM tCompanyExtract e
LEFT JOIN (SELECT company_number FROM tCompanies) r
ON e.company_number == r.company_number
WHERE r.company_number IS NULL
) c
JOIN tSicCodes s
ON list_contains(c.sic_code, s.sic_code)
ORDER BY c.company_number;
''').df()
)
charities = etl.fromdataframe(
db.query('''
SELECT
l.organisation,
c.charity_name,
c.registered_charity_number,
c.charity_company_registration_number,
charity_contact_postcode,
charity_contact_web,
latest_income AS charity_latest_income,
latest_expenditure AS charity_latest_expenditure
FROM Charities c
JOIN tCharities l
ON c.registered_charity_number == l.registered_charity_number
''').df()
)
db.close()
company_data = companies.dictlookupone('organisation')
charity_data = charities.dictlookupone('organisation')
Create the new landscape table
landscape_matched = (
wide_table
.addfield('location', lambda r: location.get(r.organisation, ()))
.unpack('location', newfields=['latitude', 'longitude'])
.addfield('company_data', lambda r: company_data.get(r.organisation, {}))
.unpackdict('company_data', keys=[
'company_category',
'accounts_category',
'company_number',
'company_status',
'dissolution_date',
'incorporation_date',
'post_town',
'postcode',
'sic_code',
'uri',
'type', 'score'
])
.convert('postcode', normalise_postcode)
.addfield('charity_data', lambda r: charity_data.get(r.organisation, {}))
.unpackdict('charity_data', keys=[
'charity_name',
'registered_charity_number',
'charity_company_registration_number',
'charity_contact_postcode',
'charity_contact_web',
'charity_latest_expenditure',
'charity_latest_income',
])
)
landscape = (
etl
.cat(landscape_matched, company_by_sic)
.convert('postcode', lambda x: postcode_lookup.get(x, { 'pcds': x }))
.unpackdict('postcode', keys=['pcds', 'lat', 'long', 'oslaua'])
.convert('latitude', lambda x, r: r['lat'], pass_row=True, where=lambda r: r['latitude'] == None and r['lat'] != None)
.convert('longitude', lambda x, r: r['long'], pass_row=True, where=lambda r: r['longitude'] == None and r['long'] != None)
.cutout('lat', 'long')
.convert('organisation', lambda x, r: x or r.registered_name, pass_row=True)
.cutout('registered_name')
.convert('sic_code', list)
.sort('organisation')
)
landscape.selectnotnone('company_number').duplicates('company_number')
organisation | funding_geo_code | funding_geo_name | IPSO | NPO | Project Grant | latitude | longitude | company_category | accounts_category | company_number | company_status | dissolution_date | incorporation_date | post_town | sic_code | uri | type | score | charity_name | registered_charity_number | charity_company_registration_number | charity_contact_postcode | charity_contact_web | charity_latest_expenditure | charity_latest_income | pcds | oslaua |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Equal Arts | E08000021 | Newcastle upon Tyne | None | None | True | 54.974992 | -1.610437 | PRI/LTD BY GUAR/NSC (Private, limited by guarantee, no share capital) | TOTAL EXEMPTION FULL | 01992359 | Active | None | 1986-02-21 00:00:00 | NEWCASTLE UPON TYNE | ['90040 - Operation of arts facilities'] | http://business.data.gov.uk/id/company/01992359 | fuzzy | 95 | None | None | None | None | None | None | None | NE1 8AX | E08000021 |
Equal Arts | E08000037 | Gateshead | None | None | True | 54.974992 | -1.610437 | PRI/LTD BY GUAR/NSC (Private, limited by guarantee, no share capital) | TOTAL EXEMPTION FULL | 01992359 | Active | None | 1986-02-21 00:00:00 | NEWCASTLE UPON TYNE | ['90040 - Operation of arts facilities'] | http://business.data.gov.uk/id/company/01992359 | fuzzy | 95 | None | None | None | None | None | None | None | NE1 8AX | E08000021 |
International Guitar Foundation & Festivals | E08000021 | Newcastle upon Tyne | None | None | True | None | None | PRI/LTD BY GUAR/NSC (Private, limited by guarantee, no share capital) | MICRO ENTITY | 02932317 | Active | None | 1994-05-24 00:00:00 | LONDON | ['90030 - Artistic creation'] | http://business.data.gov.uk/id/company/02932317 | direct | 100 | None | None | None | None | None | None | None | N1 9AG | None |
International Guitar Foundation & Festivals | E08000022 | North Tyneside | None | None | True | None | None | PRI/LTD BY GUAR/NSC (Private, limited by guarantee, no share capital) | MICRO ENTITY | 02932317 | Active | None | 1994-05-24 00:00:00 | LONDON | ['90030 - Artistic creation'] | http://business.data.gov.uk/id/company/02932317 | direct | 100 | None | None | None | None | None | None | None | N1 9AG | None |
Vane Contemporary Art Limited | E08000021 | Newcastle upon Tyne | None | None | True | 54.965252 | -1.602409 | PRI/LTD BY GUAR/NSC (Private, limited by guarantee, no share capital) | MICRO ENTITY | 04313545 | Active | None | 2001-10-30 00:00:00 | GATESHEAD | ['90040 - Operation of arts facilities'] | http://business.data.gov.uk/id/company/04313545 | direct | 100 | None | None | None | None | None | None | None | NE8 2AP | E08000037 |
...
landscape.selectnotnone('registered_charity_number').select(lambda r: r.company_number != r.charity_company_registration_number)
organisation | funding_geo_code | funding_geo_name | IPSO | NPO | Project Grant | latitude | longitude | company_category | accounts_category | company_number | company_status | dissolution_date | incorporation_date | post_town | sic_code | uri | type | score | charity_name | registered_charity_number | charity_company_registration_number | charity_contact_postcode | charity_contact_web | charity_latest_expenditure | charity_latest_income | pcds | oslaua |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Sangini | E08000023 | South Tyneside | None | None | True | None | None | None | None | None | None | None | None | None | None | None | None | None | SANGINI | 1124769 | 01124769 | NE34 0RG | https://sanginiafriend.wordpress.com | 58142 | 20577 | None | None |
Finally, write the CSV file
landscape.tocsv(DATA / 'culture_landscape.csv')
landscape.cut('sic_code').selectnotnone('sic_code')
sic_code |
---|
['90030 - Artistic creation'] |
['70229 - Management consultancy activities other than financial management', '90020 - Support activities to performing arts'] |
['90020 - Support activities to performing arts'] |
['59111 - Motion picture production activities', '59112 - Video production activities', '74909 - Other professional, scientific and technical activities n.e.c.'] |
['58110 - Book publishing'] |
...