Skip to content

NPACore/nih-grants

Repository files navigation

NIH Funding 2024

Files

curated files include

data/ is large and not tracked. The csv.gz files are in LFS (git lfs pull). See also the #Code section for more.

du -h *.csv.gz email/emails_*csv.gz data/{1985,2025}.pkl data/abstracts.csv.zip
size  file
154M  grants_PI-repeat_FY-1985:2026.csv.gz
19M   email/emails_FY2015:2022.csv.gz
110M  data/1985.pkl
659M  data/2025.pkl
962M  data/abstracts.csv.zip

Notes

Available data

Each grant has a nested data structure

{
│   'appl_id': 11142308,
│   'subproject_id': None,
│   'fiscal_year': 2025,
│   'project_num': '5F31ES036890-02',
│   'project_serial_num': 'ES036890',
│   'organization': {
│   │   'org_name': 'UNIVERSITY OF ALABAM'+15,
│   │   'city': None,
│   │   'country': None,
│   │   'org_city': 'BIRMINGHAM',
│   │   'org_country': 'UNITED STATES',
│   │   'org_state': 'AL',
│   │   'org_state_name': None,
│   │   'dept_type': 'NEUROLOGY',
│   │   'fips_country_code': None,
│   │   'org_duns': ['063690705'],
│   │   'org_ueis': ['YND4PLMC9AN7'],
│   │   'primary_duns': '063690705',
│   │   'primary_uei': 'YND4PLMC9AN7',
│   │   'org_fips': 'US',
│   │   'org_ipf_code': '1288803',
│   │   'org_zipcode': '352940001',
│   │   'external_org_id': 1288803
│   },
│   'award_type': '5',
│   'activity_code': 'F31',
│   'award_amount': 40828,
│   'is_active': True,
│   'project_num_split': {
│   │   'appl_type_code': '5',
│   │   'activity_code': 'F31',
│   │   'ic_code': 'ES',
│   │   'serial_num': '036890',
│   │   'support_year': '02',
│   │   'full_support_year': '02',
│   │   'suffix_code': ''
│   },
│   'principal_investigat'+3: [
│   │   {
│   │   │   'profile_id': 78314578,
│   │   │   'first_name': 'CENSORED',
│   │   │   'middle_name': '',
│   │   │   'last_name': 'CENSORED',
│   │   │   'is_contact_pi': True,
│   │   │   'full_name': 'CENSORED',
│   │   │   'title': 'GRADUATE STUDENT TRA'+4
│   │   }
│   ],
│   'contact_pi_name': 'ADAMSON, ASHLEY ',
│   'program_officers': [
│   │   {
│   │   │   'first_name': 'CENSORED',
│   │   │   'middle_name': '',
│   │   │   'last_name': 'CENSORED',
│   │   │   'full_name': 'CENSORED'
│   │   }
│   ],
│   'agency_ic_admin': {
│   │   'code': 'ES',
│   │   'abbreviation': 'NIEHS',
│   │   'name': 'National Institute o'+31
│   },
│   'agency_ic_fundings': [
│   │   {
│   │   │   'fy': 2025,
│   │   │   'code': 'ES',
│   │   │   'name': 'National Institute o'+31,
│   │   │   'abbreviation': 'NIEHS',
│   │   │   'total_cost': 40828.0,
│   │   │   'direct_cost_ic': 40828.0,
│   │   │   'indirect_cost_ic': 0.0
│   │   }
│   ],
│   'cong_dist': 'AL-07',
│   'spending_categories': None,
│   'project_start_date': '2024-08-01T00:00:00',
│   'project_end_date': '2027-07-31T00:00:00',
│   'organization_type': {
│   │   'name': 'SCHOOLS OF MEDICINE',
│   │   'code': '10',
│   │   'is_other': False
│   },
│   'geo_lat_lon': {'lon': -86.799772, 'lat': 33.50591},
│   'opportunity_number': 'PA-23-271',
│   'full_study_section': {
│   │   'srg_code': 'ZRG1',
│   │   'srg_flex': None,
│   │   'sra_designator_code': 'F03C',
│   │   'sra_flex_code': 'K',
│   │   'group_code': '20',
│   │   'name': 'Special Emphasis Pan'+19
│   },
│   'award_notice_date': '2025-07-28T00:00:00',
│   'is_new': False,
│   'mechanism_code_dc': 'TR',
│   'core_project_num': 'F31ES036890',
│   'terms': '<21+ years old><Absc'+6443,
│   'pref_terms': 'Adult;Aging;Air;Area'+1882,
│   'abstract_text': 'Project Summary\nThe '+2798,
│   'project_title': 'The role for cell cy'+64,
│   'phr_text': 'Project Narrative\nTh'+811,
│   'spending_categories_'+4: None,
│   'agency_code': 'NIH',
│   'covid_response': None,
│   'arra_funded': 'N',
│   'budget_start': '2025-08-01T00:00:00',
│   'budget_end': '2026-07-31T00:00:00',
│   'cfda_code': '93.113',
│   'funding_mechanism': 'Training, Individual',
│   'direct_cost_amt': 40828,
│   'indirect_cost_amt': 0,
│   'project_detail_url': 'https://reporter.nih'+29,
│   'date_added': '2025-08-02T16:13:37'
}

Extracted

The grants_PI-repeat_FY-1985:2026.csv.gz export has a subset of the data flattened and repeated for each PI.

# One-row overview of every project: project and organization counts, mean
# number of PIs per project, and the total and median of each cost column.
# The result is formatted with fmt() (helper not shown here) and transposed
# so that each statistic prints on its own row.
proj |>
  ungroup() |>
  summarise(
    n_projects = n(),
    # (sic) misspelled name kept so labels match the printed output
    n_organizatoins = length(unique(org)),
    PIperProj_mean = mean(n_pi),
    # named selection renames the columns, giving direct_total, direct_median, ...
    across(
      c(direct = direct_cost_amt,
        indirect = indirect_cost_amt,
        awarded = award_amount),
      list(total = \(x) sum(x, na.rm = TRUE),
           median = \(x) median(x, na.rm = TRUE))
    )
  ) |>
  fmt() |>
  t()
n_projects       79,374
n_organizatoins  2,888
PIperProj_mean   2.6
direct_total     29,847,146,202
direct_median    256,779
indirect_total   10,805,710,856
indirect_median  112,671
awarded_total    42,169,253,525
awarded_median   388,750
# Histogram of direct cost on a log10 x-axis for R, K and F grants,
# with one dodged, semi-transparent bar series per grant letter.
proj |>
  filter(letter %in% c("R", "K", "F")) |>
  ggplot() +
  aes(x = direct_cost_amt, fill = letter) +
  geom_histogram(alpha = .7, position = "dodge") +
  #geom_density(alpha=.7) +
  # lower limit of 5000 drops smaller direct costs from the plot
  scale_x_continuous(trans = "log10", limits = c(5000, NA)) +
  cowplot::theme_cowplot() +
  #facet_wrap(letter~.) +
  labs(fill = "grant", x = "direct cost (log)", title = "distribution of award by type")

img/proj_hist.png

Study sections

Popular study sections

# Ten study-section codes (srg) with the most grants, with the first and
# last year each one appears. The count is aliased as `n` so that the
# ORDER BY below refers to an existing column.
duckdb -csv -separator $'\t' data/abstracts.csv "
  select srg,
         count(*) as n,
         min(year) as firstyear,
         max(year) as lastyear
  from abstracts
  group by srg
  order by n desc
  limit 10;"
srg   n      firstyear  lastyear
NCI   59805  1996       2025
SRC   43821  1985       2014
NSS   40185  1985       2025
RIRG  33931  1996       2016
DDK   24343  1987       2025
HLBP  16105  1994       2025
STC   15216  1985       1997
CHHD  12724  1997       2025
SSS   12074  1985       2000
SAT   11388  1985       2025

Sections matching fMRI

# Ten (section, srg) pairs with the most grants whose abstract contains
# "fMRI" (ILIKE, so the match ignores case).
duckdb -csv -separator $'\t' data/abstracts.csv "
  SELECT count(*) AS n_grants,
         srg,
         section
  FROM abstracts
  WHERE abstract ILIKE '%fMRI%'
  GROUP BY section, srg
  ORDER BY n_grants DESC
  LIMIT 10;"
n_grants  srg   section
1081      NPAS  Neural Basis of Psychopathology, Addictions and Sleep Disorders Study Section
759       APDA  Adult Psychopathology and Disorders of Aging Study Section
660       CPDD  Child Psychopathology and Developmental Disabilities Study Section
610       CP    Cognition and Perception Study Section
378       LCOM  Language and Communication Study Section
309       NSS   NSS
277       MESH  Biobehavioral Mechanisms of Emotion, Stress and Health Study Section
250       RIRG  Clinical Research Review Committee
238       NIDA  Career Development Education and Training Study Section
210       COG   Cognitive Neuroscience Study Section

Top

Institution

# Per-organization roll-up of projects: total and median award amount,
# number of projects (overall and for R / K grants), mean number of PIs
# per project, and summed direct / indirect costs.
proj_org_smry <- proj |>
  group_by(org) |>
  summarise(
    # no na.rm here: a missing award_amount makes the org's total NA
    amount = sum(award_amount),
    n_proj = n(),
    # summing the logical counts matches; na.rm skips NA letters
    n_R = sum(letter == "R", na.rm = TRUE),
    n_K = sum(letter == "K", na.rm = TRUE),
    mean_n_pi = mean(n_pi),
    median_amount = median(award_amount),
    direct_cost = sum(direct_cost_amt, na.rm = TRUE),
    indirect_cost = sum(indirect_cost_amt, na.rm = TRUE)
  )

# First six organizations when ordered by total award amount, largest first
proj_org_smry |>
  arrange(desc(amount)) |>
  head() |>
  fmt()
   org                                      amount       n_proj  n_R  n_K  mean_n_pi  median_amount  direct_cost  indirect_cost
1  JOHNS HOPKINS UNIVERSITY                 967,554,620  1,826   922  191  2.8        404,828        716,140,229  267,819,162
2  UNIVERSITY OF CALIFORNIA, SAN FRANCISCO  923,404,391  1,741   889  232  2.5        395,536        680,359,737  245,279,242
3  WASHINGTON UNIVERSITY                    901,899,906  1,455   805  123  3.1        393,750        679,451,125  223,127,797
4  UNIVERSITY OF MICHIGAN AT ANN ARBOR      840,742,085  1,663   965  155  2.7        388,052        613,767,433  228,042,377
5  UNIVERSITY OF PENNSYLVANIA               790,934,580  1,557   824  132  2.6        406,043        560,715,831  235,792,132
6  UNIVERSITY OF PITTSBURGH AT PITTSBURGH   747,102,172  1,435   858  114  2.7        400,107        542,554,963  209,616,818

Contact PI

# Per-contact-PI roll-up of projects: total and median award, summed direct
# cost, project count, mean number of PIs per project, and a ';'-joined
# list of the PI's organizations truncated to 100 characters.
proj_pi_smry <- proj |>
  ungroup() |>
  # Normalise the key BEFORE grouping so the groups are built from the final
  # value: upper-case it, then drop everything between the ':' and the last
  # space (removes first and middle name).
  # NOTE(review): assumes contact_pi looks like "<id>:<first> <middle> <last>".
  mutate(contact_pi = gsub(":.* ", ":", toupper(contact_pi))) |>
  group_by(contact_pi) |>
  summarise(
    # no na.rm here: a missing award_amount makes the PI's total NA
    amount = sum(award_amount),
    direct = sum(direct_cost_amt, na.rm = TRUE),
    n_proj = n(),
    mean_n_pi = mean(n_pi),
    median_amount = median(award_amount),
    # strip common filler words from org names to keep the joined label short
    org = substr(
      paste(
        collapse = ";",
        gsub("(UNIVERSITY|OF|SCHOOL|INSTITUTE) ?", "", unique(org))
      ),
      0, 100
    )
  )

# Histogram of total amount per contact PI (log10 x-axis), filled by the
# binned mean number of PIs on that PI's projects.
proj_pi_smry |>
  filter(amount > 100) |>
  mutate(
    projects = cut(n_proj, breaks = c(0, 1, 2, 5, 10, 50, Inf)),
    pis = cut(mean_n_pi, breaks = c(0, 1, 2, 5, 10, 50))
  ) |>
  ggplot() +
  aes(x = amount, fill = pis) +
  geom_histogram() +
  scale_x_continuous(trans = "log10") +
  cowplot::theme_cowplot() +
  labs(fill = "mean N co-pi", title = "Amount per contact-PI")

img/pi.png

# Five-number summary (plus mean and NA count) of total amount per contact PI
summary(proj_pi_smry[["amount"]])
     Min.   1st Qu.    Median      Mean   3rd Qu.      Max.      NA's
        1    211712    462500    851443    858892 341743406      1620

By total amount

# First six contact PIs when ordered by total award amount, largest first
proj_pi_smry |>
  arrange(desc(amount)) |>
  head(n = 6) |>
  fmt()
   contact_pi          amount       n_proj  mean_n_pi  median_amount  org
1  79478801:BRISCOE    341,743,406  43      1.0        1,117,108      LEIDOS BIOMEDICAL RESEARCH, INC.
2  10753426:NOLEN      289,804,000  2       1.5        144,902,000    RESEARCH TRIANGLE
3  10829359:GROSS      110,114,217  2       4.0        55,057,108     NEW YORK MEDICINE
4  78492086:MONTALVAN  66,600,000   2       1.0        33,300,000     WESTAT, INC.
5  1882258:BOXER       61,341,887   7       8.3        984,055        CALIFORNIA, SAN FRANCISCO;MAYO CLINIC ROCHESTER
6  6190835:DIAMOND     60,794,967   12      5.5        765,900        STANFORD ;MARYLAND BALTIMORE;WASHINGTON ;PITTSBURGH AT PITTSBURGH

By number of projects

# First six contact PIs when ordered by number of projects, most first,
# skipping the placeholder key "NONE:".
proj_pi_smry |>
  filter(contact_pi != "NONE:") |>
  arrange(desc(n_proj)) |>
  head(n = 6) |>
  fmt()
   contact_pi         amount       n_proj  mean_n_pi  median_amount  org
1  79478801:BRISCOE   341,743,406  43      1          1,117,108      LEIDOS BIOMEDICAL RESEARCH, INC.
2  1891624:EBERLEIN   13,217,171   30      23         258,998        WASHINGTON
3  8497898:SHEPPARD   359,300      28      1          14,250         KEYSTONE SYMPOSIA
4  7039607:STEWART    2,027,360    25      1          40,000         COLD SPRING HARBOR LABORATORY
5  79112606:FREEDMAN  46,535,946   25      1          225,042        LEIDOS BIOMEDICAL RESEARCH, INC.
6  6774622:PASCHE     4,805,560    23      2          150,162        WAYNE STATE ;WAKE FOREST HEALTH SCIENCES

Code

  • Makefile will use get_grants.py to pull data/*.pkl grant information per year.
  • get_grants.py uses the NIH RePORTER API, but queries state by state (plus DC and PR) to avoid return limits. This saves all output (>700 MB!) to a pickle file.
  • grants_to_csv.py parses the pickle to csv.
  • get_abstracts.py will extract the abstracts (data/abstracts.csv.zip).

About

No description, website, or topics provided.

Resources

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published