|
48 | 48 | "def separateByClass(dataset):\n",
|
49 | 49 | " separated = {}\n",
|
50 | 50 | " for i in range(len(dataset)):\n",
|
51 |
| - " x = dataset[i]\n", |
| 51 | + " x = dataset[i] # current row\n", |
52 | 52 | " if (x[-1] not in separated):\n",
|
53 | 53 | " separated[x[-1]] = []\n",
|
54 | 54 | " separated[x[-1]].append(x)\n",
|
|
57 | 57 | "def compute_mean_std(dataset):\n",
|
58 | 58 | " mean_std = [ (st.mean(attribute), st.stdev(attribute))\n",
|
59 | 59 | "    for attribute in zip(*dataset)]; #zip(*dataset) transposes a matrix (2-d array/list)\n",
|
60 |
| - " del mean_std[-1] # Exclude label\n", |
| 60 | + " del mean_std[-1] # Exclude label, i.e., target\n", |
61 | 61 | " return mean_std\n",
|
62 | 62 | "\n",
|
63 |
| - "def summarizeByClass(dataset):\n", |
| 63 | + "def summarizeByClass(dataset): # summary maps each class value to per-attribute (mean, std)\n", |
64 | 64 | " separated = separateByClass(dataset);\n",
|
65 | 65 | " summary = {} # to store mean and std of +ve and -ve instances\n",
|
66 | 66 | " for classValue, instances in separated.items():\n",
|
67 | 67 | "        #summary is a dictionary of tuples (mean, std) for each class value\n",
|
68 | 68 | " summary[classValue] = compute_mean_std(instances)\n",
|
69 | 69 | " return summary\n",
|
70 |
| - "#For continuous attributes p is estimated using Gaussion distribution\n", |
| 70 | + "\n", |
| 71 | + "#For continuous attributes, p is estimated using Gaussian distribution\n", |
71 | 72 | "def estimateProbability(x, mean, stdev):\n",
|
72 | 73 | " exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))\n",
|
73 | 74 | " return (1 / (math.sqrt(2*math.pi) * stdev)) * exponent\n",
|
74 | 75 | "\n",
|
75 |
| - "\n", |
| 76 | + "# calculate class probabilities of that entire row (testVector)\n", |
76 | 77 | "def calculateClassProbabilities(summaries, testVector):\n",
|
77 | 78 | " p = {}\n",
|
78 | 79 | " #class and attribute information as mean and sd\n",
|
79 | 80 | " for classValue, classSummaries in summaries.items():\n",
|
80 | 81 | " p[classValue] = 1\n",
|
81 | 82 | " for i in range(len(classSummaries)):\n",
|
82 | 83 | " mean, stdev = classSummaries[i]\n",
|
83 |
| - " x = testVector[i] #testvector's first attribute\n", |
| 84 | + " x = testVector[i] #testvector's i-th attribute\n", |
| 85 | + " \n", |
84 | 86 | " #use normal distribution\n",
|
85 |
| - " p[classValue] *= estimateProbability(x, mean, stdev);\n", |
| 87 | + " p[classValue] *= estimateProbability(x, mean, stdev)\n", |
| 88 | + " \n", |
86 | 89 | " return p\n",
|
87 | 90 | "\n",
|
| 91 | + "# calculate best out of all class probabilities of that entire row (testVector)\n", |
88 | 92 | "def predict(summaries, testVector):\n",
|
89 | 93 | " all_p = calculateClassProbabilities(summaries, testVector)\n",
|
90 | 94 | " bestLabel, bestProb = None, -1\n",
|
|
94 | 98 | " bestLabel = lbl\n",
|
95 | 99 | " return bestLabel\n",
|
96 | 100 | "\n",
|
| 101 | + "# find predicted class for each row in testSet\n", |
97 | 102 | "def perform_classification(summaries, testSet):\n",
|
98 | 103 | " predictions = []\n",
|
99 | 104 | " for i in range(len(testSet)):\n",
|
|
116 | 121 | "print(\"First Five instances of dataset:\")\n",
|
117 | 122 | "for i in range(5):\n",
|
118 | 123 | " print(i+1 , ':' , dataset[i])\n",
|
| 124 | + " \n", |
| 125 | + "\n", |
119 | 126 | "splitRatio = 0.2\n",
|
120 | 127 | "trainingSet, testSet = splitDataset(dataset, splitRatio)\n",
|
| 128 | + "\n", |
121 | 129 | "print('\\nDataset is split into training and testing set.')\n",
|
122 |
| - "print('Training examples = {0} \\nTesting examples = {1}'.format(len(trainingSet),\n", |
123 |
| - "len(testSet)))\n", |
| 130 | + "print('Training examples = {0} \\nTesting examples = {1}'.format(len(trainingSet), len(testSet)))\n", |
| 131 | + "\n", |
124 | 132 | "summaries = summarizeByClass(trainingSet);\n",
|
| 133 | + "\n", |
125 | 134 | "predictions = perform_classification(summaries, testSet)\n",
|
| 135 | + "\n", |
126 | 136 | "accuracy = getAccuracy(testSet, predictions)\n",
|
| 137 | + "\n", |
127 | 138 | "print('\\nAccuracy of the Naive Bayesian Classifier is :', accuracy)"
|
128 | 139 | ]
|
129 | 140 | },
|
|
0 commit comments