# pylint: disable = W0614, W0401, C0411
# the codes above correspond to unused-wildcard-import, wildcard-import, and
# wrong-import-order; alternatively, pylint can be run from the command line
# with options, e.g.: pylint --disable=Cxxxx,Wxxxx yyyy.py zzzz.py
import argparse
import json
from pathlib import Path

import numpy as np
import pandas as pd
import yaml
from loguru import logger
parser = argparse.ArgumentParser()
# original dataset file
parser.add_argument("--priv_data", type=str, default="./data/accidential_drug_deaths.csv",
                    help="path of the original data file in csv format")
# priv_data_name is used to name milestone (intermediate) files
parser.add_argument("--priv_data_name", type=str,
                    help="must be specified; used for intermediate naming to avoid mix-ups")
# config file which includes identifier and binning settings
parser.add_argument("--config", type=str, default="./config/data.yaml",
                    help="path of the config file in yaml format")
# number of records to generate; the default 0 leaves the size to the synthesizer
parser.add_argument("--n", type=int, default=0,
                    help="number of records to generate")
# params file which includes the schema of the original dataset
parser.add_argument("--params", type=str, default="./data/parameters.json",
                    help="path of the parameters file in json format")
# datatype file which includes the data types of the columns
parser.add_argument("--datatype", type=str, default="./data/column_datatypes.json",
                    help="path of the datatype file in json format")
# marginal_config specifies how marginals are selected and used
parser.add_argument("--marginal_config", type=str, default="./config/eps=10.0.yaml",
                    help="path of the marginal config file in yaml format")
# hyperparameter: the number of update iterations
parser.add_argument("--update_iterations", type=int, default=30,
                    help="number of update iterations")
# target path of the synthetic dataset
parser.add_argument("--target_path", type=str, default="out.csv",
                    help="target path of the synthetic dataset")
args = parser.parse_args()
PRIV_DATA = args.priv_data
PRIV_DATA_NAME = args.priv_data_name
CONFIG_DATA = args.config
PARAMS = args.params
DATA_TYPE = args.datatype
MARGINAL_CONFIG = args.marginal_config
UPDATE_ITERATIONS = args.update_iterations
TARGET_PATH = args.target_path
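# An example invocation (paths and the data name are illustrative, not fixed
# by the code; only the flag names above are real):
#   python experiment.py --priv_data ./data/accidential_drug_deaths.csv \
#       --priv_data_name accident --config ./config/data.yaml \
#       --marginal_config ./config/eps=10.0.yaml --n 100 --target_path out.csv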
from data.DataLoader import *
from data.RecordPostprocessor import RecordPostprocessor
from method.dpsyn import DPSyn
def main():
    np.random.seed(0)

    with open(args.config, 'r', encoding="utf-8") as f:
        config = yaml.load(f, Loader=yaml.BaseLoader)

    # dataloader initialization
    dataloader = DataLoader()
    dataloader.load_data()

    # the only method wired up here is dpsyn (see run_method)
    n = args.n
    syn_data = run_method(config, dataloader, n)

    # if the user set a record count, report it; either way the result is
    # written to TARGET_PATH
    if n != 0:
        print("------------------------> now we synthesize a dataset with", n, "rows")
    syn_data.to_csv(Path(TARGET_PATH), index=False)
def run_method(config, dataloader, n):
    parameters = json.loads(Path(args.params).read_text())
    syn_data = None

    # each item in 'runs' specifies one dp task with (eps, delta, sensitivity),
    # plus an optional 'max_records' value which bounds the dataset's size
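    # a hypothetical parameters.json fragment showing the structure consumed
    # below (field names match the keys read from each run; values are made up):
    #   {"runs": [{"epsilon": 10.0, "delta": 3.4e-11,
    #              "max_records_per_individual": 7}]}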
    for r in parameters["runs"]:
        # 'max_records_per_individual' is the global sensitivity of the designed
        # function f; in this example f is the count, and you may change it
        eps, delta, sensitivity = r['epsilon'], r['delta'], r['max_records_per_individual']
        # logger is imported in synthesizer.py; DPSyn inherits from Synthesizer
        logger.info(f'working on eps={eps}, delta={delta}, and sensitivity={sensitivity}')
        # we use DPSyn to generate a dataset
        """A sketch of the runtime logic below:
        1. DPSyn(Synthesizer) receives the dataloader, eps, delta, and sensitivity;
           Synthesizer itself is minimal and only stores these parameters in __init__.
        2. we call synthesizer.synthesize(fixed_n=n), which is defined in dpsyn.py.
        3. synthesize looks like:
               def synthesize(self, fixed_n=0) -> pd.DataFrame:
                   # obtain_consistent_marginals(self, priv_marginal_config, priv_split_method) -> Marginals
                   noisy_marginals = self.obtain_consistent_marginals()
        4. it calls get_noisy_marginals(), which is defined in synthesizer.py:
               noisy_marginals = self.get_noisy_marginals(priv_marginal_config, priv_split_method)
        5. inside get_noisy_marginals():
               # first generate the exact (punctual) marginals
               priv_marginal_sets, epss = self.data.generate_marginal_by_config(
                   self.data.private_data, priv_marginal_config)
               # todo: consider fine-tuned noise-adding methods for one-way and two-way marginals?
               # then add noise to obtain the noisy marginals
               noisy_marginals = self.anonymize(priv_marginal_sets, epss, priv_split_method)
        6. generate_marginal_by_config(), defined in DataLoader.py, expects config
           entries like:
               priv_all_one_way:   # or priv_all_two_way
                   total_eps: xxxxx
        7. anonymize(), defined in synthesizer.py, looks like:
               def anonymize(self, priv_marginal_sets: Dict, epss: Dict, priv_split_method: Dict) -> Marginals:
                   noisy_marginals = {}
                   for set_key, marginals in priv_marginal_sets.items():
                       eps = epss[set_key]
                       # noise_type, noise_param = advanced_composition.get_noise(eps, self.delta, self.sensitivity, len(marginals))
                       noise_type = priv_split_method[set_key]
           note that (1) priv_split_method is hard-coded, and (2) the noise type is
           decided by advanced_composition.get_noise()
        """
        synthesizer = DPSyn(dataloader, eps, delta, sensitivity)
        # synthesize returns a DataFrame
        tmp = synthesizer.synthesize(fixed_n=n)
        # tag each synthesized frame with its epsilon; remove this column
        # before comparing against the original data, for consistency
        tmp['epsilon'] = eps
        # accumulate the per-run frames into one DataFrame
        if syn_data is None:
            syn_data = tmp
        else:
            # DataFrame.append was removed in pandas 2.0; use pd.concat instead
            syn_data = pd.concat([syn_data, tmp], ignore_index=True)
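    # when comparing the synthetic data with the original, drop the bookkeeping
    # column first, e.g.: syn_data.drop(columns=['epsilon'])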
    # post-process the generated data: map records with grouped/binned
    # attributes back to the original attribute values
    print("********************* START POSTPROCESSING ***********************")
    postprocessor = RecordPostprocessor()
    syn_data = postprocessor.post_process(syn_data, args.config, dataloader.decode_mapping)
    logger.info("------------------------> synthetic data post-processed:")
    print(syn_data)
    return syn_data
if __name__ == "__main__":
    main()