df
country | year | m014 | m1524 | m2534 | m3544 | m4554 | m5564 | m65 | mu | f014 | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | AD | 2000 | 0.0 | 0.0 | 1.0 | 0.0 | 0 | 0 | 0.0 | NaN | NaN |
1 | AE | 2000 | 2.0 | 4.0 | 4.0 | 6.0 | 5 | 12 | 10.0 | NaN | 3.0 |
2 | AF | 2000 | 52.0 | 228.0 | 183.0 | 149.0 | 129 | 94 | 80.0 | NaN | 93.0 |
3 | AG | 2000 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 1.0 | NaN | 1.0 |
4 | AL | 2000 | 2.0 | 19.0 | 21.0 | 14.0 | 24 | 19 | 16.0 | NaN | 3.0 |
5 | AM | 2000 | 2.0 | 152.0 | 130.0 | 131.0 | 63 | 26 | 21.0 | NaN | 1.0 |
6 | AN | 2000 | 0.0 | 0.0 | 1.0 | 2.0 | 0 | 0 | 0.0 | NaN | 0.0 |
7 | AO | 2000 | 186.0 | 999.0 | 1003.0 | 912.0 | 482 | 312 | 194.0 | NaN | 247.0 |
8 | AR | 2000 | 97.0 | 278.0 | 594.0 | 402.0 | 419 | 368 | 330.0 | NaN | 121.0 |
9 | AS | 2000 | NaN | NaN | NaN | NaN | 1 | 1 | NaN | NaN | NaN |
melted = pd.melt(df, id_vars=["country","year"], value_name="cases",
var_name="sex_and_age")
melted
country | year | sex_and_age | cases | |
---|---|---|---|---|
0 | AD | 2000 | m014 | 0.0 |
1 | AE | 2000 | m014 | 2.0 |
2 | AF | 2000 | m014 | 52.0 |
3 | AG | 2000 | m014 | 0.0 |
4 | AL | 2000 | m014 | 2.0 |
... | ... | ... | ... | ... |
85 | AM | 2000 | f014 | 1.0 |
86 | AN | 2000 | f014 | 0.0 |
87 | AO | 2000 | f014 | 247.0 |
88 | AR | 2000 | f014 | 121.0 |
89 | AS | 2000 | f014 | NaN |
90 rows × 4 columns
tmp_df = melted["sex_and_age"].str.extract(
"(?P<sex>\D)(?P<age_lower>\d+)(?P<age_upper>\d{2})")
tmp_df
sex | age_lower | age_upper | |
---|---|---|---|
0 | m | 0 | 14 |
10 | m | 15 | 24 |
20 | m | 25 | 34 |
30 | m | 35 | 44 |
40 | m | 45 | 54 |
50 | m | 55 | 64 |
60 | NaN | NaN | NaN |
81 | f | 0 | 14 |
# Merge
melted = pd.concat([melted, tmp_df], axis=1)
melted
country | year | sex_and_age | cases | sex | age_lower | age_upper | |
---|---|---|---|---|---|---|---|
0 | AD | 2000 | m014 | 0.0 | m | 0 | 14 |
33 | AG | 2000 | m3544 | 0.0 | m | 35 | 44 |
30 | AD | 2000 | m3544 | 0.0 | m | 35 | 44 |
43 | AG | 2000 | m4554 | 0.0 | m | 45 | 54 |
46 | AN | 2000 | m4554 | 0.0 | m | 45 | 54 |
... | ... | ... | ... | ... | ... | ... | ... |
47 | AO | 2000 | m4554 | 482.0 | m | 45 | 54 |
28 | AR | 2000 | m2534 | 594.0 | m | 25 | 34 |
37 | AO | 2000 | m3544 | 912.0 | m | 35 | 44 |
17 | AO | 2000 | m1524 | 999.0 | m | 15 | 24 |
27 | AO | 2000 | m2534 | 1003.0 | m | 25 | 34 |
64 rows × 7 columns
melted["age"] = melted["age_lower"] + "-" + melted["age_upper"]
melted
country | year | sex_and_age | cases | sex | age_lower | age_upper | age | |
---|---|---|---|---|---|---|---|---|
0 | AD | 2000 | m014 | 0.0 | m | 0 | 14 | 0-14 |
33 | AG | 2000 | m3544 | 0.0 | m | 35 | 44 | 35-44 |
30 | AD | 2000 | m3544 | 0.0 | m | 35 | 44 | 35-44 |
43 | AG | 2000 | m4554 | 0.0 | m | 45 | 54 | 45-54 |
46 | AN | 2000 | m4554 | 0.0 | m | 45 | 54 | 45-54 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
47 | AO | 2000 | m4554 | 482.0 | m | 45 | 54 | 45-54 |
28 | AR | 2000 | m2534 | 594.0 | m | 25 | 34 | 25-34 |
37 | AO | 2000 | m3544 | 912.0 | m | 35 | 44 | 35-44 |
17 | AO | 2000 | m1524 | 999.0 | m | 15 | 24 | 15-24 |
27 | AO | 2000 | m2534 | 1003.0 | m | 25 | 34 | 25-34 |
64 rows × 8 columns
melted = melted.drop(['sex_and_age',"age_lower","age_upper"], axis=1)
melted = melted.dropna()
melted = melted.sort_values(by=["country", "year", "sex", "age"])
melted
country | year | cases | sex | age | |
---|---|---|---|---|---|
0 | AD | 2000 | 0.0 | m | 0-14 |
10 | AD | 2000 | 0.0 | m | 15-24 |
20 | AD | 2000 | 1.0 | m | 25-34 |
30 | AD | 2000 | 0.0 | m | 35-44 |
40 | AD | 2000 | 0.0 | m | 45-54 |
... | ... | ... | ... | ... | ... |
38 | AR | 2000 | 402.0 | m | 35-44 |
48 | AR | 2000 | 419.0 | m | 45-54 |
58 | AR | 2000 | 368.0 | m | 55-64 |
49 | AS | 2000 | 1.0 | m | 45-54 |
59 | AS | 2000 | 1.0 | m | 55-64 |
64 rows × 5 columns