| | |
| | import re |
| | import yaml |
| | import pickle |
| |
|
| | import numpy as np |
| | import pandas as pd |
| |
|
| | from lifelines import LogLogisticAFTFitter |
| | from KaplanMeierEstimator import KaplanMeierEstimator |
| |
|
| | import warnings |
| | warnings.filterwarnings('ignore') |
| |
|
class CanSave:
    '''Feature-engineering object of the Can-Save method.

    The constructor reads a YAML config and then loads the table of selected
    ICD-10 groups and the pickled survival models that the config points to.
    ``feature_engineering`` converts one patient's EHR into a flat ``dict``
    of features.
    '''

    def __load_object_by_pickle(self, path):
        '''Load and return the object stored in the pickle file at ``path``.'''
        # SECURITY: unpickling can execute arbitrary code. Only reference model
        # files in the config that you created yourself or otherwise trust.
        with open(path, 'rb') as file:
            return pickle.load(file)

    def __load_config(self, config_path):
        '''Read the YAML config file and store it in ``self.config``.'''
        with open(config_path, 'r') as file:
            self.config = yaml.safe_load(file)

    def __load_ICD10_groups(self):
        '''Load the table of ICD-10 groups into ``self.groups``.

        The Excel file is taken from ``config['path_icd10_groups']``. Rows
        without ``left_code``/``right_code`` are dropped and only rows with
        ``selected == 1`` are kept.
        '''
        path_icd10_groups = self.config['path_icd10_groups']
        groups = pd.read_excel(path_icd10_groups).dropna(subset=['left_code', 'right_code'])
        self.groups = groups[groups['selected'] == 1]

    def __load_survival_models(self):
        '''Load YOUR trained survival models into ``self.survival_models``.'''
        # DISCLAIMER!
        # We do not make trained models publicly available.
        # Therefore, we only provide an example of how survival models can be trained.
        # Example is located in the "Example_How_To_Train_Survival_Models.py".
        model_names = ('kaplan_meier_males', 'kaplan_meier_females', 'kaplan_meier_both', 'aft')
        # Each model is read from the config key "path_<model name>".
        self.survival_models = {
            name: self.__load_object_by_pickle(self.config[f'path_{name}'])
            for name in model_names
        }

    def __init__(self, CONFIG_PATH):
        '''Constructor: load the config, the ICD-10 groups and the survival models.'''
        self.__load_config(CONFIG_PATH)
        self.__load_ICD10_groups()
        self.__load_survival_models()

    def __check_russian_service_code(self, code):
        '''Normalize a medical service code used in the Russian Federation.

        Examples: 'A4.16' -> 'A04.16', 'B01.42' -> 'B01.042'.
        The part before the dot is padded to a letter plus two digits; the part
        after the dot is padded with zeros (or cut to its last characters) to
        2 characters for 'A' codes and 3 characters for 'B' codes.

        Raises:
            ValueError: if the first part contains neither 'A' nor 'B'.
        '''
        code = str(code).upper()
        parts = code.split('.')
        first = parts[0]
        second = parts[1] if len(parts) > 1 else '00'

        if 'A' in first:
            length = 2
        elif 'B' in first:
            length = 3
        else:
            raise ValueError(f'Invalid medical code: {code}')

        if len(first) < 3:
            # Pad the numeric tail to two digits. This also covers a bare
            # letter ('A' -> 'A00'), which previously raised IndexError.
            first = first[0] + first[1:].rjust(2, '0')

        if len(second) > length:
            second = second[-length:]
        else:
            second = second.rjust(length, '0')

        return '{}.{}'.format(first, second)

    def __common_features(self, features, ehr, date_start, date_pred):
        '''Add region-independent features to ``features`` and return it.

        ``ehr`` must contain the columns 'date', 'code' and 'is_diagnose' and
        is not modified. The insertion order of the keys is kept stable.
        '''
        # Calendar features of the prediction date.
        features['month_pred'] = date_pred.month
        features['day_of_week'] = date_pred.isoweekday()

        # Number of EHR records and the share of diagnoses / services.
        visit_num = len(ehr)
        features['visit_num'] = visit_num
        diagnosis_prop = ehr['is_diagnose'].mean()
        features['diagnosis_prop'] = diagnosis_prop
        features['services_prop'] = 1.0 - diagnosis_prop

        # Distances (in weeks) from the first / last record to the prediction date.
        first_visit = ehr['date'].min()
        last_visit = ehr['date'].max()
        features['weeks_after_first_visit'] = (date_pred - first_visit).days / 7.
        features['weeks_after_last_visit'] = (date_pred - last_visit).days / 7.

        # NOTE(review): despite the name this value is in DAYS per visit (there
        # is no division by 7). Left unchanged so that the features stay
        # compatible with already trained models - confirm the intent.
        features['weeks_betw_visits_avg'] = (last_visit - first_visit).days / (visit_num + 1.0e-14)

        # Number of records in the last N months (both bounds inclusive).
        for month in [1, 3, 6, 9, 12, 15, 18, 21, 24]:
            start_dt = date_pred - pd.DateOffset(months=month)
            mask = (start_dt <= ehr['date']) & (ehr['date'] <= date_pred)
            features[f'visits_in_last_{month}_months'] = mask.sum()

        # Diagnoses only. Work on an explicit copy so that the new columns are
        # not written into a slice of the caller's frame.
        df = ehr[ehr['is_diagnose'] == 1].copy()
        df['code_short'] = df['code'].apply(lambda code: str(code).split('.')[0])

        icd_groups = list(zip(self.groups['left_code'], self.groups['right_code']))

        # Number of diagnoses inside every selected ICD-10 group.
        for left_mkb, right_mkb in icd_groups:
            mask = (left_mkb <= df['code_short']) & (df['code_short'] <= right_mkb)
            features['diagnose_group_{}_{}'.format(left_mkb, right_mkb)] = mask.sum()

        # Number of distinct diagnoses and of distinct ICD-10 letters.
        features['unique_diagnosis'] = len(set(df['code_short']))
        df['code_class'] = df['code_short'].apply(lambda code: re.match('[A-Z]', str(code)).group(0))
        features['unique_classes_of_diagnosis'] = len(set(df['code_class']))

        # Binary flag per ICD-10 group.
        for left_mkb, right_mkb in icd_groups:
            count = features[f'diagnose_group_{left_mkb}_{right_mkb}']
            features[f'has_{left_mkb}_{right_mkb}'] = 1 if count > 0 else 0

        # Weeks since the first / last diagnosis of every group. When the group
        # never occurs, the full length of the observation window is used.
        # NOTE(review): this loop compares the full 'code' (e.g. 'C50.1') while
        # the counters above compare 'code_short' ('C50'), so a code on the
        # right border of a group is counted above but not found here.
        # Left unchanged for compatibility with trained models - confirm.
        window_weeks = (date_pred - date_start).days / 7.
        for left_mkb, right_mkb in icd_groups:
            mask = (left_mkb <= df['code']) & (df['code'] <= right_mkb)
            group_dates = df.loc[mask, 'date']

            if len(group_dates) > 0:
                weeks_from_first = (date_pred - group_dates.min()).days / 7.
                weeks_from_last = (date_pred - group_dates.max()).days / 7.
            else:
                weeks_from_first = window_weeks
                weeks_from_last = window_weeks

            features[f'weeks_from_first_{left_mkb}_{right_mkb}'] = weeks_from_first
            features[f'weeks_from_last_{left_mkb}_{right_mkb}'] = weeks_from_last

        return features

    def __regional_features(self, features, ehr, date_start, date_pred):
        '''
        Add features adopted to some regional specific characteristics.
        This method should be adopted for your country.

        Requires ``features['month_pred']`` to be set already. ``ehr`` is not
        modified. Returns the same ``features`` dict.
        '''
        # Hard-coded average humidity per calendar month.
        avg_humidity = {
            1: 88., 2: 84., 3: 74., 4: 65., 5: 64., 6: 68.,
            7: 72., 8: 69., 9: 77., 10: 81., 11: 88., 12: 91.
        }
        month_pred = features['month_pred']
        features['avg_humidity'] = avg_humidity[month_pred]

        # Hard-coded average temperature per calendar month.
        avg_temperature = {
            1: -7.2, 2: -5.2, 3: -0.5, 4: 8.1, 5: 15.5, 6: 18.0,
            7: 20.3, 8: 18.9, 9: 13.3, 10: 5.7, 11: 1.3, 12: -3.4
        }
        features['avg_temperature'] = avg_temperature[month_pred]

        # Medical services only, with normalized codes. Explicit copy so the
        # helper column is not written into a slice of the caller's frame.
        df = ehr[ehr['is_diagnose'] == 0].copy()
        df['code_mod'] = df['code'].apply(self.__check_russian_service_code)
        code_mod = df['code_mod']

        # NOTE(review): Series.str.contains interprets the pattern as a regular
        # expression by default, so the '.' in the patterns below matches ANY
        # character (e.g. '.01' also matches the 'A01' prefix of 'A01.16').
        # Left unchanged so that the features stay compatible with already
        # trained models; pass regex=False if a literal match is intended.

        # Counters by the code part before the dot: 'A00.' ... 'A30.', 'B01.' ... 'B05.'.
        prefixes = [f'A{num:02d}.' for num in range(30+1)] + [f'B{num:02d}.' for num in range(1, 5+1)]
        for val in prefixes:
            features['service_group_{}XX'.format(val)] = code_mod.str.contains(val).sum()

        # Counters by the code part after the dot: '.01' ... '.30', '.001' ... '.070'.
        suffixes = [f'.{num:02d}' for num in range(1, 30+1)] + [f'.{num:03d}' for num in range(1, 70+1)]
        for val in suffixes:
            features['service_group_Axx{}'.format(val)] = code_mod.str.contains(val).sum()

        return features

    def __kaplan_meier_features(self, features, model, tag, age, horizons):
        '''Add S(age + h) and |S(age + h) - S(age)| of one Kaplan-Meier model for every horizon h (months).'''
        s_age = model(age)
        for horizon in horizons:
            s = model(age + horizon/12.)
            features[f'KaplanMeier_{tag}_S(AGE+{horizon}M)'] = s
            features[f'KaplanMeier_{tag}_|S(AGE+{horizon}M)-S(AGE)|'] = abs(s - s_age)

    def __survival_analysis_features(self, features):
        '''Add features based on the trained survival models and return ``features``.

        Reads ``features['sex']`` and ``features['actual_age']`` plus every
        covariate listed in ``self.survival_models['aft']['covariates']``.
        '''
        sex = features['sex']
        age = features['actual_age']
        horizons = list(range(0, 24+1))  # months ahead of the current age

        # Kaplan-Meier estimate fitted on both sexes.
        self.__kaplan_meier_features(
            features, self.survival_models['kaplan_meier_both'], 'BOTH', age, horizons
        )

        # Kaplan-Meier estimate fitted on the patient's own sex (1 == male).
        sex_key = 'kaplan_meier_males' if sex == 1 else 'kaplan_meier_females'
        self.__kaplan_meier_features(
            features, self.survival_models[sex_key], 'SEX', age, horizons
        )

        # AFT model: one survival curve for this patient at age + horizon.
        aft = self.survival_models['aft']
        model = aft['model']
        covariates = aft['covariates']

        df = pd.DataFrame([{key: features[key] for key in covariates}])
        times = [age + val/12. for val in horizons]
        survivals = model.predict_survival_function(df, times=times)

        # Column 0 is the curve of the single row of ``df``.
        curve = survivals[0]
        s_age = curve.iloc[0]
        for horizon, s in zip(horizons, curve):
            features[f'AFT_S(AGE+{horizon}M)'] = s
            features[f'AFT_|S(AGE+{horizon}M)-S(AGE)|'] = abs(s - s_age)

        return features

    def feature_engineering(self,
                            sex: str,
                            birth_date: str,
                            ehr: pd.DataFrame,
                            date_pred: str,
                            deep_weeks: int
                            ):
        '''
        Method to make feature engineering for the Can-Save method.
        sex: str            # 'F' - FEMALE, 'M' - MALE
        birth_date: str     # 'YYYY-MM-DD'
        ehr: pd.DataFrame   # Example is located in './EHR/id_26.csv'
        date_pred: str      # 'YYYY-MM-DD'
        deep_weeks: int     # deep_weeks > 0

        Returns a dict {feature name: value}. The passed ``ehr`` frame is left
        untouched.

        Raises:
            KeyError: if ``ehr`` lacks one of 'date', 'code', 'is_diagnose'.
        '''
        set_of_required_columns = {'date', 'code', 'is_diagnose'}
        missing_columns = set_of_required_columns.difference(set(ehr.columns))
        if len(missing_columns) > 0:
            raise KeyError(f'There are missing columns: {missing_columns}.')

        # Work on a private copy: the date column is converted below and the
        # caller's DataFrame must not change as a side effect.
        ehr = ehr.copy()
        ehr['date'] = pd.to_datetime(ehr['date'], format='%Y-%m-%d')
        date_pred = pd.to_datetime(date_pred, format='%Y-%m-%d')
        date_start = date_pred - pd.DateOffset(weeks=deep_weeks)

        # 1 == male, 0 == female. Matching on the first letter keeps 'M'/'F'
        # (and 'Male'/'Female') working and no longer maps 'FEMALE' to male.
        sex = 1 if sex.strip().upper().startswith('M') else 0
        birth_date = pd.to_datetime(birth_date, format='%Y-%m-%d')

        # Keep only the records inside the observation window.
        in_window = (date_start <= ehr['date']) & (ehr['date'] <= date_pred)
        ehr = ehr[in_window]

        # No records in the window: use a single placeholder service record
        # dated at the prediction date so that every feature is still defined.
        if len(ehr) == 0:
            ehr = pd.DataFrame([{'date': date_pred, 'code': 'A00.00', 'is_diagnose': 0}])

        days_per_year = (365 * 3 + 366) / 4.
        features = {
            'sex': sex,
            'actual_age': (date_pred - birth_date).days / days_per_year,
        }

        features = self.__common_features(features, ehr, date_start, date_pred)
        features = self.__regional_features(features, ehr, date_start, date_pred)
        features = self.__survival_analysis_features(features)

        return features
|
| | |
if __name__ == '__main__':
    # Build the feature extractor from the YAML config.
    config_path = './CONFIG_CanSave.yaml'
    cs = CanSave(CONFIG_PATH=config_path)
    # help() prints the documentation itself and returns None, so wrapping it
    # in print() only added a stray "None" line to the output.
    help(cs)

    # Load the example EHR; sex and birth date are read from its first row.
    path_ehr = './EHR/id_26.csv'
    ehr = pd.read_csv(path_ehr, sep=';').set_index('patient_id')
    sex = ehr['sex'].iloc[0]
    birth_date = ehr['birth_date'].iloc[0]

    # Compute the features for the example patient.
    features = cs.feature_engineering(
        sex=sex,
        birth_date=birth_date,
        ehr=ehr,
        date_pred='2022-01-01',
        deep_weeks=108,
    )
| |
|