#!/usr/bin/python3
#
# Apply various estimates of the age-stratified Infection Fatality Ratio of COVID-19 to
# countries' population pyramids in order to calculate their overall IFR.
# Author: Marc Bevand — @zorinaq

import pandas as pd

# Pyramid data is from the United Nations: this file is a CSV export of the first sheet
# of "Population by Age Groups - Both Sexes" linked from:
# https://population.un.org/wpp/Download/Standard/Population/
# Direct link:
# https://population.un.org/wpp/Download/Files/1_Indicators%20(Standard)/EXCEL_FILES/1_Population/WPP2019_POP_F07_1_POPULATION_BY_AGE_BOTH_SEXES.xlsx
file_pyramids = 'WPP2019_POP_F07_1_POPULATION_BY_AGE_BOTH_SEXES.csv'

maxage = 100

# Age groups defined in the CSV file
age_groups = [(0,4), (5,9), (10,14), (15,19), (20,24), (25,29), (30,34), (35,39), (40,44), (45,49), (50,54), (55,59), (60,64), (65,69), (70,74), (75,79), (80,84), (85,89), (90,94), (95,99), (100,maxage)]

# This will hold parsed pyramid data. Example to get the number of people in the
# age group 20-24 in France: pyramid['France'][(20,24)]
pyramid = {}

# Various age-stratified IFR estimates
ifrs = [

        # Calculated from Spanish ENE-COVID study
        # (see calc_ifr.py)
        ('ENE-COV', {
            (0,9):    0.003,
            (10,19):  0.004,
            (20,29):  0.015,
            (30,39):  0.030,
            (40,49):  0.064,
            (50,59):  0.213,
            (60,69):  0.718,
            (70,79):  2.384,
            (80,89):  8.466,
            (90,maxage): 12.497,
        }),

        # US CDC estimate as of 10 Sep 2020
        # https://www.cdc.gov/coronavirus/2019-ncov/hcp/planning-scenarios.html
        # (table 1)
        ('US_CDC', {
            (0,19):   0.003,
            (20,49):  0.02,
            (50,69):  0.5,
            (70,maxage): 5.4,
        }),

        # Verity et al.
        # https://www.thelancet.com/journals/laninf/article/PIIS1473-3099(20)30243-7/fulltext
        # (table 1)
        ('Verity', {
            (0,9):    0.00161,
            (10,19):  0.00695,
            (20,29):  0.0309,
            (30,39):  0.0844,
            (40,49):  0.161,
            (50,59):  0.595,
            (60,69):  1.93,
            (70,79):  4.28,
            (80,maxage): 7.80,
        }),

        # Levin et al.
        # https://www.medrxiv.org/content/10.1101/2020.07.23.20160895v5
        # (table 3)
        ('Levin', {
            (0,34):   0.004,
            (35,44):  0.06,
            (45,54):  0.2,
            (55,64):  0.7,
            (65,74):  2.3,
            (75,84):  7.6,
            (85,maxage): 22.3,
        }),

        # Gudbjartsson et al., Humoral Immune Response to SARS-CoV-2 in Iceland
        # https://www.nejm.org/doi/full/10.1056/NEJMoa2026116
        # Supplementary Appendix 1
        # https://www.nejm.org/doi/suppl/10.1056/NEJMoa2026116/suppl_file/nejmoa2026116_appendix_1.pdf
        # (table S7)
        ('Gudbj', {
            (0,70):   0.1,
            (71,80):  2.4,
            (81,maxage): 11.2,
        }),

        # O’Driscoll et al., Age-specific mortality and immunity patterns of SARS-CoV-2 infection in 45 countries
        # https://www.medrxiv.org/content/10.1101/2020.08.24.20180851v1
        # (table S4)
        ("O'Drisc", {
            (0,4):   0.002,
            (5,9):   0.000,
            (10,14): 0.000,
            (15,19): 0.002,
            (20,24): 0.004,
            (25,29): 0.009,
            (30,34): 0.017,
            (35,39): 0.029,
            (40,44): 0.053,
            (45,49): 0.086,
            (50,54): 0.154,
            (55,59): 0.241,
            (60,64): 0.359,
            (65,69): 0.642,
            (70,74): 1.076,
            (75,79): 2.276,
            (80,maxage): 7.274,
        }),

]

def ag2str(age_group):
    if age_group[1] == maxage:
        return f'{age_group[0]}+'
    return f'{age_group[0]}-{age_group[1]}'

def parse_pyramids():
    df = pd.read_csv(file_pyramids)
    # ignore labels as they don't contain any data
    df = df[df['Type'] != 'Label/Separator']
    # only take rows with data as of 2020
    df = df[df['Reference date (as of 1 July)'] == 2020]
    # only parse countries, world, and continents
    df = df[df['Type'].isin(('Country/Area', 'World', 'Region'))]
    # remove spaces used as thousands separators, and convert cell values to floats
    columns = [ag2str(x) for x in age_groups]
    for col in columns:
        df[col] = df[col].str.replace('\s+', '').astype(float)
    regions = list(df['Region, subregion, country or area *'])
    #regions = ('France',)
    for region in regions:
        pyramid[region] = {}
        df_region = df[df['Region, subregion, country or area *'] == region]
        for ag in age_groups:
            # values are in thousands
            pyramid[region][ag] = 1000 * float(df_region[ag2str(ag)])

def people_of_age(pyramid_region, age):
    # Returns the number of people of exact age 'age', given the provided age pyramid
    for ((a, b), n) in pyramid_region.items():
        if age in range(a, b + 1):
            return n / float(b - a + 1)

def overall_ifr(pyramid_region, ifr_age_stratified):
    pop = 0
    deaths = 0
    for (age_group, ifr) in ifr_age_stratified.items():
        for age in range(age_group[0], age_group[1] + 1):
            pop += people_of_age(pyramid_region, age)
            deaths += people_of_age(pyramid_region, age) * ifr / 100.0
    assert pop == sum(pyramid_region.values())
    return 100.0 * deaths / pop

def calc_overall_ifrs():
    oifrs = []
    for region in pyramid.keys():
        # The overall IFRs are appended to the array in the same order as listed in ifrs
        tmp = []
        for i in ifrs:
            ifr_age_stratified = i[1]
            # calculate the overall IFR for region, using IFR estimate ifr_age_stratified
            tmp.append(overall_ifr(pyramid[region], ifr_age_stratified))
        oifrs.append((region, *tmp))
    return oifrs

def show_overall_ifrs(oifrs):
    def header():
        for i in ifrs:
            print(f'| {i[0]:>7} ', end='')
        print('| Region |')
    # Each entry in the oifrs array is a tuple:
    # (<region_name>, <ifr_according_to_1st_estimate>, <ifr_according_to_2nd_estimate>, ...)
    # Sort by element index 1, that is by <ifr_according_to_1st_estimate>
    # To sort by region name, use index 0 (x[0])
    oifrs.sort(key=lambda x: x[1], reverse=True)
    header()
    for region in oifrs:
        for i in region[1:]:
            print(f'| {i:7.3f} ', end='')
        print(f'| {region[0]} |')
    header()

def main():
    parse_pyramids()
    oifrs = calc_overall_ifrs()
    show_overall_ifrs(oifrs)

if __name__ == '__main__':
    main()