Some OpenAddresses machine stats

I’ve collected the state.txt files written by OpenAddress machine and stuffed them in a database for ad hoc reporting. Here’s some queries and results

Slowest processing

select source, max(address_count), round(avg(process_time)) as time
from stats where address_count > 0
group by source order by time desc limit 10;

source                          max(addres  time
------------------------------  ----------  ----------
nl.json                         14846597    8769.0
au-victoria.json                3426983     8438.0
es-25830.json                   8778892     6450.0
us-va.json                      3504160     4377.0
dk.json                         3450853     4220.0
pl-mazowieckie.json             995946      4108.0
us-ny.json                      4229542     4047.0
pl-lodzkie.json                 685093      3575.0
pl-slaskie.json                 575596      3331.0
pl-malopolski.json              670892      3216.0

Slowest cache

source                          max(addres  time
------------------------------  ----------  ----------
us-ny.json                      4229542     5808.0
za-nl-ethekwini.json            478939      3617.0
us-va-city_of_chesapeake.json   99197       3457.0
us-ca-marin_county.json         104067      3034.0
us-mi-ottawa.json               109202      2232.0
us-fl-lee.json                  548264      1999.0
us-ne-lancaster.json            97859       1995.0
us-ia-linn.json                 95928       1519.0
dk.json                         3450853     1348.0
us-az-mesa.json                 331816      890.0

Summary of runs

For each run, print out how many sources we tried, how many cached, how many sampled, and how many processed. The counting here is a bit bogus but I think close.

select ts, count(distinct(source)), count(distinct(cache_url)), count(distinct(sample_url)), count(distinct(processed_url))
from stats group by ts;
ts               addresses        source  caches  sample  proces
---------------  ---------------  ------  ------  ------  ------
1420182459                        737     689     557     490
1420528178                        738     691     559     490
1420787323                        740     692     560     495
1421133148                        789     717     558     521
1421392045                        790     718     559     523
1421737732                        790     719     557     521
1421996867                        790     719     558     522
1422342479                        790     718     560     524
1422601615                        790     652     582     503
1422645440                        790     655     585     511
1422748762                        790     650     580     504
1422773454                        790     647     577     506
1422860989                        790     653     582     508
1423034051                        790     658     588     509
1423206384                        790     658     587     508
1423465798                        790     660     588     510
1423638409                        790     659     588     508
1423811369                        790     661     590     508
1424070358                        790     657     586     509
1424243321                        790     645     566     499
1424416039                        790     650     534     469
1424675318        88031206        790     648     563     496
1424849197       111999281        794     658     570     505
1425022188       105056481        794     653     570     508
1425280104       109864743        794     651     567     507
1425453036       111734276        786     655     573     520
1425625770       116323932        786     656     571     519
1425881358       113713353        786     644     563     528
1426054556       117771563        788     666     586     534
1426228915       117335107        788     666     584     538
1427090950       113916633        788     665     581     527

Time to complete successful runs

 select ts, round(avg(cache_time)) as cache, round(avg(process_time)) as process 
 from stats where length(processed_url) > 1 
 group by ts order by ts;
ts               cache            proces
---------------  ---------------  ------
1420182459       63.0             154.0
1420528178       50.0             147.0
1420787323       46.0             166.0
1421133148       43.0             159.0
1421392045       43.0             160.0
1421737732       43.0             163.0
1421996867       45.0             167.0
1422342479       46.0             164.0
1422601615       48.0             228.0
1422645440       56.0             314.0
1422748762       47.0             300.0
1422773454       50.0             289.0
1422860989       61.0             296.0
1423034051       66.0             263.0
1423206384       44.0             260.0
1423465798       51.0             294.0
1423638409       51.0             270.0
1423811369       50.0             260.0
1424070358       55.0             277.0
1424243321       60.0             265.0
1424416039       57.0             298.0
1424675318       51.0             265.0
1424849197       62.0             323.0
1425022188       53.0             284.0
1425280104       59.0             312.0
1425453036       55.0             325.0
1425625770       79.0             367.0
1425881358       77.0             315.0
1426054556       84.0             335.0
1426228915       76.0             361.0
1427090950       71.0             272.0

Most improved

A count of which sources have the most variance in how many address lines we got out. kinda bogus, but fun.

select source, max(address_count)-min(address_count) as diff, min(address_count) as minc, max(address_count) as maxc
from stats where address_count > 0 
group by source order by diff desc limit 15;

source                          diff        minc        maxc
------------------------------  ----------  ----------  ----------
us-mn-dakota.json               138301      27218       165519
us-pa-allegheny.json            53399       474492      527891
us-ne-lancaster.json            44996       52863       97859
us-va-salem.json                9992        614         10606
dk.json                         5258        3445595     3450853
be-flanders.json                2809        3365557     3368366
us-nc-mecklenburg.json          2117        499524      501641
us-co-larimer.json              1118        169500      170618
ca-on-hamilton.json             1112        243554      244666
us-va-james_city.json           1071        33383       34454
us-tx-austin.json               837         374854      375691
us-or-portland.json             786         415220      416006
us-nc-buncombe.json             784         147492      148276
us-nc-union.json                720         81241       81961
us-oh-williams.json             603         18496       19099