df
| country | year | m014 | m1524 | m2534 | m3544 | m4554 | m5564 | m65 | mu | f014 | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AD | 2000 | 0.0 | 0.0 | 1.0 | 0.0 | 0 | 0 | 0.0 | NaN | NaN |
| 1 | AE | 2000 | 2.0 | 4.0 | 4.0 | 6.0 | 5 | 12 | 10.0 | NaN | 3.0 |
| 2 | AF | 2000 | 52.0 | 228.0 | 183.0 | 149.0 | 129 | 94 | 80.0 | NaN | 93.0 |
| 3 | AG | 2000 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 1.0 | NaN | 1.0 |
| 4 | AL | 2000 | 2.0 | 19.0 | 21.0 | 14.0 | 24 | 19 | 16.0 | NaN | 3.0 |
| 5 | AM | 2000 | 2.0 | 152.0 | 130.0 | 131.0 | 63 | 26 | 21.0 | NaN | 1.0 |
| 6 | AN | 2000 | 0.0 | 0.0 | 1.0 | 2.0 | 0 | 0 | 0.0 | NaN | 0.0 |
| 7 | AO | 2000 | 186.0 | 999.0 | 1003.0 | 912.0 | 482 | 312 | 194.0 | NaN | 247.0 |
| 8 | AR | 2000 | 97.0 | 278.0 | 594.0 | 402.0 | 419 | 368 | 330.0 | NaN | 121.0 |
| 9 | AS | 2000 | NaN | NaN | NaN | NaN | 1 | 1 | NaN | NaN | NaN |
melted = pd.melt(df, id_vars=["country","year"], value_name="cases",
var_name="sex_and_age")
melted
| country | year | sex_and_age | cases | |
|---|---|---|---|---|
| 0 | AD | 2000 | m014 | 0.0 |
| 1 | AE | 2000 | m014 | 2.0 |
| 2 | AF | 2000 | m014 | 52.0 |
| 3 | AG | 2000 | m014 | 0.0 |
| 4 | AL | 2000 | m014 | 2.0 |
| ... | ... | ... | ... | ... |
| 85 | AM | 2000 | f014 | 1.0 |
| 86 | AN | 2000 | f014 | 0.0 |
| 87 | AO | 2000 | f014 | 247.0 |
| 88 | AR | 2000 | f014 | 121.0 |
| 89 | AS | 2000 | f014 | NaN |
90 rows × 4 columns
tmp_df = melted["sex_and_age"].str.extract(
"(?P<sex>\D)(?P<age_lower>\d+)(?P<age_upper>\d{2})")
tmp_df
| sex | age_lower | age_upper | |
|---|---|---|---|
| 0 | m | 0 | 14 |
| 10 | m | 15 | 24 |
| 20 | m | 25 | 34 |
| 30 | m | 35 | 44 |
| 40 | m | 45 | 54 |
| 50 | m | 55 | 64 |
| 60 | NaN | NaN | NaN |
| 81 | f | 0 | 14 |
# Merge
melted = pd.concat([melted, tmp_df], axis=1)
melted
| country | year | sex_and_age | cases | sex | age_lower | age_upper | |
|---|---|---|---|---|---|---|---|
| 0 | AD | 2000 | m014 | 0.0 | m | 0 | 14 |
| 33 | AG | 2000 | m3544 | 0.0 | m | 35 | 44 |
| 30 | AD | 2000 | m3544 | 0.0 | m | 35 | 44 |
| 43 | AG | 2000 | m4554 | 0.0 | m | 45 | 54 |
| 46 | AN | 2000 | m4554 | 0.0 | m | 45 | 54 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 47 | AO | 2000 | m4554 | 482.0 | m | 45 | 54 |
| 28 | AR | 2000 | m2534 | 594.0 | m | 25 | 34 |
| 37 | AO | 2000 | m3544 | 912.0 | m | 35 | 44 |
| 17 | AO | 2000 | m1524 | 999.0 | m | 15 | 24 |
| 27 | AO | 2000 | m2534 | 1003.0 | m | 25 | 34 |
64 rows × 7 columns
melted["age"] = melted["age_lower"] + "-" + melted["age_upper"]
melted
| country | year | sex_and_age | cases | sex | age_lower | age_upper | age | |
|---|---|---|---|---|---|---|---|---|
| 0 | AD | 2000 | m014 | 0.0 | m | 0 | 14 | 0-14 |
| 33 | AG | 2000 | m3544 | 0.0 | m | 35 | 44 | 35-44 |
| 30 | AD | 2000 | m3544 | 0.0 | m | 35 | 44 | 35-44 |
| 43 | AG | 2000 | m4554 | 0.0 | m | 45 | 54 | 45-54 |
| 46 | AN | 2000 | m4554 | 0.0 | m | 45 | 54 | 45-54 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 47 | AO | 2000 | m4554 | 482.0 | m | 45 | 54 | 45-54 |
| 28 | AR | 2000 | m2534 | 594.0 | m | 25 | 34 | 25-34 |
| 37 | AO | 2000 | m3544 | 912.0 | m | 35 | 44 | 35-44 |
| 17 | AO | 2000 | m1524 | 999.0 | m | 15 | 24 | 15-24 |
| 27 | AO | 2000 | m2534 | 1003.0 | m | 25 | 34 | 25-34 |
64 rows × 8 columns
melted = melted.drop(['sex_and_age',"age_lower","age_upper"], axis=1)
melted = melted.dropna()
melted = melted.sort_values(by=["country", "year", "sex", "age"])
melted
| country | year | cases | sex | age | |
|---|---|---|---|---|---|
| 0 | AD | 2000 | 0.0 | m | 0-14 |
| 10 | AD | 2000 | 0.0 | m | 15-24 |
| 20 | AD | 2000 | 1.0 | m | 25-34 |
| 30 | AD | 2000 | 0.0 | m | 35-44 |
| 40 | AD | 2000 | 0.0 | m | 45-54 |
| ... | ... | ... | ... | ... | ... |
| 38 | AR | 2000 | 402.0 | m | 35-44 |
| 48 | AR | 2000 | 419.0 | m | 45-54 |
| 58 | AR | 2000 | 368.0 | m | 55-64 |
| 49 | AS | 2000 | 1.0 | m | 45-54 |
| 59 | AS | 2000 | 1.0 | m | 55-64 |
64 rows × 5 columns