retrying cohort size variation

alexandriapawlik · alexandriapawlik · commit 6bcce9347b8c · 2020-04-17T23:52:43.000-04:00
diff --git a/1_each_client_partially_iid.py b/1_each_client_partially_iid.py
@@ -78,9 +78,6 @@ def go(self, num, batch):
 		multi_labels = np.asarray(multi_labels)
 		num_data_per_client = []
 
-		# print(multi_iid.shape)
-		# print(multi_labels.shape)
-
 		# assign each client non-IID and IID data
 		for client_num in range(self.CLIENTS):
 			# number of data points for this client
@@ -118,14 +115,19 @@ def go(self, num, batch):
 					indices_slice = indices[:data_per_label]
 				client_sample_x = np.append(client_sample_x, label_data_x[indices_slice], axis=0)
 				client_sample_y = np.append(client_sample_y, label_data_y[indices_slice], axis=0)
+
 				# count multiplicities
 				for i in range(len(indices_slice)):
 					multi_labels[label][int(indices_slice[i])] = multi_labels[label][int(indices_slice[i])] + 1
+					
 				# check data
 				if np.average(client_sample_y) > 9 or np.average(client_sample_y) < 0:
+					print("Error: At least one label out of range")
 					print(np.average(client_sample_y), label)
 					print()
 
+				# TODO: print data multiplicities
+
 			# track number of data points per client
 			num_data_per_client.append(len(client_sample_x))
 
diff --git a/3_shard.py b/3_shard.py
@@ -102,7 +102,9 @@ def go(self, num, batch):
 
 			# check data
 			if np.average(client_sample_y) > 9 or np.average(client_sample_y) < 0:
+				print("Error: At least one label out of range")
 				print(np.average(client_sample_y))
+				print()
 
 			# assign slices to single client
 			dataset = tf.data.Dataset.from_tensor_slices((client_sample_x, client_sample_y))
diff --git a/graphing.py b/graphing.py
@@ -0,0 +1,115 @@
+import numpy as np
+import math
+import csv
+import matplotlib.pyplot as plt
+
+# Plots, per cohort size, the last recorded round number of each test for
+# schema 1 (clients with partially IID data), comparing the 1 CPU and 2 CPU runs.
+# Reads results/<batch>/<batch>.<test>.s1out.csv and .config.csv files.
+
+
+def result_path(batch, test_num, suffix):
+	"""Return 'results/<batch>/<batch>.<test_num>.<suffix>'."""
+	return 'results/' + str(batch) + '/' + str(batch) + '.' + str(test_num) + '.' + suffix
+
+
+def read_rounds_and_accuracy(filename):
+	"""Read one schema output CSV.
+
+	Skips the header row, then takes column 0 of every row as the round
+	number and column 4 as the accuracy. Returns (round_num, accuracy),
+	with each accuracy multiplied by 100.
+	Raises ValueError if the file has no data rows.
+	"""
+	round_num = []
+	accuracy = []
+	with open(filename, 'r', newline='') as csvfile:
+		data = csv.reader(csvfile, delimiter=',')
+		next(data)  # skip header row
+		for row in data:
+			round_num.append(int(row[0]))
+			# float(), not int(): int() raises ValueError on a decimal
+			# string such as '0.98'
+			accuracy.append(float(row[4]) * 100)
+	if not round_num:
+		raise ValueError('no data rows in ' + filename)
+	return round_num, accuracy
+
+
+batch = 5355833
+test = [1, 2, 3, 4]
+
+# for full batch:
+# number of rounds vs cohort size for all three schemas
+cohort_size = []
+rounds_s1 = []
+rounds_s3 = []
+rounds_s4 = []
+
+for test_num in test:
+	# for each test:
+	# accuracy vs round number for all schemas
+	round_num, accuracy_s1 = read_rounds_and_accuracy(result_path(batch, test_num, 's1out.csv'))
+
+	# append last line to full batch data
+	rounds_s1.append(round_num[-1])
+
+	# schemas 3 and 4 are not read for now
+	# round_num_s3, accuracy_s3 = read_rounds_and_accuracy(result_path(batch, test_num, 's3out.csv'))
+	# rounds_s3.append(round_num_s3[-1])
+
+	# round_num_s4, accuracy_s4 = read_rounds_and_accuracy(result_path(batch, test_num, 's4out.csv'))
+	# rounds_s4.append(round_num_s4[-1])
+
+	# get cohort size: first field of the first row after the header
+	with open(result_path(batch, test_num, 'config.csv'), 'r', newline='') as csvfile:
+		data = csv.reader(csvfile, delimiter=',')
+		next(data)  # skip header row
+		# kept as the string read from the file, so matplotlib treats the
+		# cohort sizes as category labels rather than numbers
+		cohort_size.append(next(data)[0])
+
+	# # data for cohort size specific plot
+	# max_round = max(round_num) + 1
+	# round_num = range(1, max_round)
+	# plt.clf()
+	# plt.plot(round_num, accuracy_s1, label='Schema 1: Clients partially IID')
+	# plt.plot(round_num, accuracy_s3, label='Schema 3: Sharding')
+	# plt.plot(round_num, accuracy_s4, label='Schema 4: IID')
+	# plt.xlabel('Round Number')
+	# plt.ylabel('SCA Accuracy')
+	# plt.title('Round Accuracy for Cohort Size ' + str(cohort_size[-1]))
+	# plt.legend()
+	# plt.savefig(result_path(batch, test_num, 'accuracy_vs_round.png'))
+	# plt.show()
+
+
+# get 2CPU data
+# NOTE(review): same batch number as the 1 CPU data above, so both plotted
+# lines are identical until the 2 CPU batch number is filled in here
+batch = 5355833
+test = [1, 2, 3, 4]
+
+rounds_s1_2 = []
+
+for test_num in test:
+	round_num, _ = read_rounds_and_accuracy(result_path(batch, test_num, 's1out.csv'))
+
+	# append last line to full batch data
+	rounds_s1_2.append(round_num[-1])
+
+
+# data for efficiency by cohort size plot
+plt.clf()
+plt.plot(cohort_size, rounds_s1, label='1 CPU')
+# plt.plot(cohort_size, rounds_s3, label='Schema 3: Sharding')
+# plt.plot(cohort_size, rounds_s4, label='Schema 4: IID')
+plt.plot(cohort_size, rounds_s1_2, label='2 CPU')
+plt.xlabel('Cohort Size')
+plt.ylabel('Number of Rounds to Reach 99% Accuracy')
+plt.title('Model Efficiency for Varying Cohort Size, Clients with Partially IID Data')
+plt.legend()
+# save before show: after the plt.show() window is closed the figure is
+# released, and a later savefig would write an empty image
+plt.savefig('results/' + str(batch) + '/' + str(batch) + '.rounds_vs_cohortsize.png')
+plt.show()
diff --git a/info/batches.txt b/info/batches.txt
@@ -9,5 +9,6 @@ batch #		code				description & schema #
 5321083		error				invalid label values (only s1) s134, some tests finished
 5324828 	debug				invalid label debugging with output
 5352912		debug				invalid label debugging with output, solved!
-varying cohort size, 1 CPU per
-varying cohort size, 2 CPU per
+5355833		timed out		varying cohort size, 1 CPU per, s134
+
+											varying cohort size, 2 CPU per, s134
diff --git a/test_tff.sh b/test_tff.sh
@@ -19,6 +19,7 @@ module load cuda/10.1.105
 module list
 
 mkdir results/${SLURM_ARRAY_JOB_ID}
+mkdir results/${SLURM_ARRAY_JOB_ID}/log
 
 python tff_main.py $SLURM_ARRAY_TASK_ID $SLURM_ARRAY_JOB_ID
 
diff --git a/tff_main.py b/tff_main.py
@@ -37,8 +37,8 @@
 # # p2.go(test, batch)
 # # print(datetime.now())
 
-p3.go(test, batch)
-print(datetime.now())
+# p3.go(test, batch)
+# print(datetime.now())
 
-p4.go(test, batch)
-print(datetime.now())
+# p4.go(test, batch)
+# print(datetime.now())