curated files include
- grants_PI-repeat_FY-1985:2026.csv.gz - row per PI, repeated grants, see get_grants.py and grants_to_csv.py
- email/emails_FY2015:2022.csv.gz - coalesced FoI provided emails, see email/00.2_combine_emails.R
data/ is large and not tracked. The csv.gz files are in LFS (git lfs pull). See also the #Code section for more.
du -h *.csv.gz email/emails_*csv.gz data/{1985,2025}.pkl data/abstracts.csv.zip

| size | file |
|---|---|
| 154M | grants_PI-repeat_FY-1985:2026.csv.gz |
| 19M | email/emails_FY2015:2022.csv.gz |
| 110M | data/1985.pkl |
| 659M | data/2025.pkl |
| 962M | data/abstracts.csv.zip |
Each grant has a nested data structure
{
│ 'appl_id': 11142308,
│ 'subproject_id': None,
│ 'fiscal_year': 2025,
│ 'project_num': '5F31ES036890-02',
│ 'project_serial_num': 'ES036890',
│ 'organization': {
│ │ 'org_name': 'UNIVERSITY OF ALABAM'+15,
│ │ 'city': None,
│ │ 'country': None,
│ │ 'org_city': 'BIRMINGHAM',
│ │ 'org_country': 'UNITED STATES',
│ │ 'org_state': 'AL',
│ │ 'org_state_name': None,
│ │ 'dept_type': 'NEUROLOGY',
│ │ 'fips_country_code': None,
│ │ 'org_duns': ['063690705'],
│ │ 'org_ueis': ['YND4PLMC9AN7'],
│ │ 'primary_duns': '063690705',
│ │ 'primary_uei': 'YND4PLMC9AN7',
│ │ 'org_fips': 'US',
│ │ 'org_ipf_code': '1288803',
│ │ 'org_zipcode': '352940001',
│ │ 'external_org_id': 1288803
│ },
│ 'award_type': '5',
│ 'activity_code': 'F31',
│ 'award_amount': 40828,
│ 'is_active': True,
│ 'project_num_split': {
│ │ 'appl_type_code': '5',
│ │ 'activity_code': 'F31',
│ │ 'ic_code': 'ES',
│ │ 'serial_num': '036890',
│ │ 'support_year': '02',
│ │ 'full_support_year': '02',
│ │ 'suffix_code': ''
│ },
│ 'principal_investigat'+3: [
│ │ {
│ │ │ 'profile_id': 78314578,
│ │ │ 'first_name': 'CENSORED',
│ │ │ 'middle_name': '',
│ │ │ 'last_name': 'CENSORED',
│ │ │ 'is_contact_pi': True,
│ │ │ 'full_name': 'CENSORED',
│ │ │ 'title': 'GRADUATE STUDENT TRA'+4
│ │ }
│ ],
│ 'contact_pi_name': 'ADAMSON, ASHLEY ',
│ 'program_officers': [
│ │ {
│ │ │ 'first_name': 'CENSORED',
│ │ │ 'middle_name': '',
│ │ │ 'last_name': 'CENSORED',
│ │ │ 'full_name': 'CENSORED'
│ │ }
│ ],
│ 'agency_ic_admin': {
│ │ 'code': 'ES',
│ │ 'abbreviation': 'NIEHS',
│ │ 'name': 'National Institute o'+31
│ },
│ 'agency_ic_fundings': [
│ │ {
│ │ │ 'fy': 2025,
│ │ │ 'code': 'ES',
│ │ │ 'name': 'National Institute o'+31,
│ │ │ 'abbreviation': 'NIEHS',
│ │ │ 'total_cost': 40828.0,
│ │ │ 'direct_cost_ic': 40828.0,
│ │ │ 'indirect_cost_ic': 0.0
│ │ }
│ ],
│ 'cong_dist': 'AL-07',
│ 'spending_categories': None,
│ 'project_start_date': '2024-08-01T00:00:00',
│ 'project_end_date': '2027-07-31T00:00:00',
│ 'organization_type': {
│ │ 'name': 'SCHOOLS OF MEDICINE',
│ │ 'code': '10',
│ │ 'is_other': False
│ },
│ 'geo_lat_lon': {'lon': -86.799772, 'lat': 33.50591},
│ 'opportunity_number': 'PA-23-271',
│ 'full_study_section': {
│ │ 'srg_code': 'ZRG1',
│ │ 'srg_flex': None,
│ │ 'sra_designator_code': 'F03C',
│ │ 'sra_flex_code': 'K',
│ │ 'group_code': '20',
│ │ 'name': 'Special Emphasis Pan'+19
│ },
│ 'award_notice_date': '2025-07-28T00:00:00',
│ 'is_new': False,
│ 'mechanism_code_dc': 'TR',
│ 'core_project_num': 'F31ES036890',
│ 'terms': '<21+ years old><Absc'+6443,
│ 'pref_terms': 'Adult;Aging;Air;Area'+1882,
│ 'abstract_text': 'Project Summary\nThe '+2798,
│ 'project_title': 'The role for cell cy'+64,
│ 'phr_text': 'Project Narrative\nTh'+811,
│ 'spending_categories_'+4: None,
│ 'agency_code': 'NIH',
│ 'covid_response': None,
│ 'arra_funded': 'N',
│ 'budget_start': '2025-08-01T00:00:00',
│ 'budget_end': '2026-07-31T00:00:00',
│ 'cfda_code': '93.113',
│ 'funding_mechanism': 'Training, Individual',
│ 'direct_cost_amt': 40828,
│ 'indirect_cost_amt': 0,
│ 'project_detail_url': 'https://reporter.nih'+29,
│ 'date_added': '2025-08-02T16:13:37'
}
The grants_PI-repeat_FY-1985:2026.csv.gz export has a subset of the data flattened and repeated for each PI.
proj |> ungroup() |>
summarise(n_projects=n(),
n_organizatoins=length(unique(org)),
PIperProj_mean = mean(n_pi),
across(c(direct=direct_cost_amt,
indirect=indirect_cost_amt,
awarded=award_amount),
c(total=\(x) sum(x,na.rm=T),
median=\(x) median(x, na.rm=T)))) |>
fmt() |>
t()

| n_projects | 79,374 |
| n_organizatoins | 2,888 |
| PIperProj_mean | 2.6 |
| direct_total | 29,847,146,202 |
| direct_median | 256,779 |
| indirect_total | 10,805,710,856 |
| indirect_median | 112,671 |
| awarded_total | 42,169,253,525 |
| awarded_median | 388,750 |
proj |> filter(letter %in% c('R','K','F')) |>
ggplot() +
aes(x=direct_cost_amt, fill=letter) +
geom_histogram(alpha=.7, position='dodge') +
#geom_density(alpha=.7) +
scale_x_continuous(trans='log10', limits=c(5000,NA)) +
cowplot::theme_cowplot() +
#facet_wrap(letter~.) +
labs(fill="grant", x="direct cost (log)", title="distribrution of award by type")

Popular study sections
duckdb -csv -separator $'\t' data/abstracts.csv "
select srg,
count(*) as n,
min(year) as firstyear,
max(year) as lastyear
from abstracts
group by srg
order by n desc
limit 10;"

| srg | n | firstyear | lastyear |
|---|---|---|---|
| NCI | 59805 | 1996 | 2025 |
| SRC | 43821 | 1985 | 2014 |
| NSS | 40185 | 1985 | 2025 |
| RIRG | 33931 | 1996 | 2016 |
| DDK | 24343 | 1987 | 2025 |
| HLBP | 16105 | 1994 | 2025 |
| STC | 15216 | 1985 | 1997 |
| CHHD | 12724 | 1997 | 2025 |
| SSS | 12074 | 1985 | 2000 |
| SAT | 11388 | 1985 | 2025 |
Sections matching fMRI
duckdb -csv -separator $'\t' data/abstracts.csv "
select
count(*) as n_grants, srg, section
from abstracts where abstract ilike '%fMRI%'
group by section, srg order by n_grants desc limit 10;"

| n_grants | srg | section |
|---|---|---|
| 1081 | NPAS | Neural Basis of Psychopathology, Addictions and Sleep Disorders Study Section |
| 759 | APDA | Adult Psychopathology and Disorders of Aging Study Section |
| 660 | CPDD | Child Psychopathology and Developmental Disabilities Study Section |
| 610 | CP | Cognition and Perception Study Section |
| 378 | LCOM | Language and Communication Study Section |
| 309 | NSS | NSS |
| 277 | MESH | Biobehavioral Mechanisms of Emotion, Stress and Health Study Section |
| 250 | RIRG | Clinical Research Review Committee |
| 238 | NIDA | Career Development Education and Training Study Section |
| 210 | COG | Cognitive Neuroscience Study Section |
proj_org_smry <- proj |>
group_by(org) |>
summarise(
amount=sum(award_amount),
n_proj=n(),
n_R=length(which(letter=="R")),
n_K=length(which(letter=="K")),
mean_n_pi = mean(n_pi),
median_amount = median(award_amount),
direct_cost=sum(direct_cost_amt,na.rm=T),
indirect_cost=sum(indirect_cost_amt,na.rm=T))
proj_org_smry |> arrange(-amount) |> head() |>fmt()

| | org | amount | n_proj | n_R | n_K | mean_n_pi | median_amount | direct_cost | indirect_cost |
|---|---|---|---|---|---|---|---|---|---|
| 1 | JOHNS HOPKINS UNIVERSITY | 967,554,620 | 1,826 | 922 | 191 | 2.8 | 404,828 | 716,140,229 | 267,819,162 |
| 2 | UNIVERSITY OF CALIFORNIA, SAN FRANCISCO | 923,404,391 | 1,741 | 889 | 232 | 2.5 | 395,536 | 680,359,737 | 245,279,242 |
| 3 | WASHINGTON UNIVERSITY | 901,899,906 | 1,455 | 805 | 123 | 3.1 | 393,750 | 679,451,125 | 223,127,797 |
| 4 | UNIVERSITY OF MICHIGAN AT ANN ARBOR | 840,742,085 | 1,663 | 965 | 155 | 2.7 | 388,052 | 613,767,433 | 228,042,377 |
| 5 | UNIVERSITY OF PENNSYLVANIA | 790,934,580 | 1,557 | 824 | 132 | 2.6 | 406,043 | 560,715,831 | 235,792,132 |
| 6 | UNIVERSITY OF PITTSBURGH AT PITTSBURGH | 747,102,172 | 1,435 | 858 | 114 | 2.7 | 400,107 | 542,554,963 | 209,616,818 |
proj_pi_smry <- proj |>
group_by(contact_pi) |>
# remove first and middle name
mutate(contact_pi=gsub(':.* ',':', toupper(contact_pi))) |>
summarise(
amount=sum(award_amount),
direct=sum(direct_cost_amt,na.rm=T),
n_proj=n(),
mean_n_pi = mean(n_pi),
median_amount = median(award_amount),
org=substr(paste(collapse=";", gsub('(UNIVERSITY|OF|SCHOOL|INSTITUTE) ?','', unique(org))),0,100))

proj_pi_smry |> filter(amount>100) |>
mutate(projects=cut(n_proj, breaks=c(0,1,2,5,10,50,Inf)),
pis=cut(mean_n_pi, breaks=c(0,1,2,5,10,50))) |>
ggplot() +
aes(x=amount, fill=pis) +
geom_histogram() +
scale_x_continuous(trans='log10') +
cowplot::theme_cowplot() +
labs(fill="mean N co-pi", title="Amount per contact-PI")

summary(proj_pi_smry$amount)
     Min.   1st Qu.    Median      Mean   3rd Qu.      Max.      NA's
        1    211712    462500    851443    858892 341743406      1620
By total amount
proj_pi_smry |>
arrange(-amount) |>
head() |> fmt()

| | contact_pi | amount | n_proj | mean_n_pi | median_amount | org |
|---|---|---|---|---|---|---|
| 1 | 79478801:BRISCOE | 341,743,406 | 43 | 1.0 | 1,117,108 | LEIDOS BIOMEDICAL RESEARCH, INC. |
| 2 | 10753426:NOLEN | 289,804,000 | 2 | 1.5 | 144,902,000 | RESEARCH TRIANGLE |
| 3 | 10829359:GROSS | 110,114,217 | 2 | 4.0 | 55,057,108 | NEW YORK MEDICINE |
| 4 | 78492086:MONTALVAN | 66,600,000 | 2 | 1.0 | 33,300,000 | WESTAT, INC. |
| 5 | 1882258:BOXER | 61,341,887 | 7 | 8.3 | 984,055 | CALIFORNIA, SAN FRANCISCO;MAYO CLINIC ROCHESTER |
| 6 | 6190835:DIAMOND | 60,794,967 | 12 | 5.5 | 765,900 | STANFORD ;MARYLAND BALTIMORE;WASHINGTON ;PITTSBURGH AT PITTSBURGH |
By number of projects
proj_pi_smry |>
filter(contact_pi!="NONE:") |>
arrange(-n_proj) |>
head() |>fmt()

| | contact_pi | amount | n_proj | mean_n_pi | median_amount | org |
|---|---|---|---|---|---|---|
| 1 | 79478801:BRISCOE | 341,743,406 | 43 | 1 | 1,117,108 | LEIDOS BIOMEDICAL RESEARCH, INC. |
| 2 | 1891624:EBERLEIN | 13,217,171 | 30 | 23 | 258,998 | WASHINGTON |
| 3 | 8497898:SHEPPARD | 359,300 | 28 | 1 | 14,250 | KEYSTONE SYMPOSIA |
| 4 | 7039607:STEWART | 2,027,360 | 25 | 1 | 40,000 | COLD SPRING HARBOR LABORATORY |
| 5 | 79112606:FREEDMAN | 46,535,946 | 25 | 1 | 225,042 | LEIDOS BIOMEDICAL RESEARCH, INC. |
| 6 | 6774622:PASCHE | 4,805,560 | 23 | 21 | 50,162 | WAYNE STATE ;WAKE FOREST HEALTH SCIENCES |
- Makefile will use get_grants.py to pull data/*.pkl grant information per year.
- get_grants.py uses the NIH RePORTER API, but goes by state (and DC + PR) to avoid return limits. This saves all output (>700 MB!) to a pickle file.
- grants_to_csv.py parses the pickle to csv.
- get_abstracts.py will extract the abstracts (data/abstracts.csv.zip).

