|
48 | 48 | "def separateByClass(dataset):\n",
|
49 | 49 | " separated = {}\n",
|
50 | 50 | " for i in range(len(dataset)):\n",
|
51 |
| - " x = dataset[i]\n", |
| 51 | + " x = dataset[i] # current row\n", |
52 | 52 | " if (x[-1] not in separated):\n",
|
53 | 53 | " separated[x[-1]] = []\n",
|
54 | 54 | " separated[x[-1]].append(x)\n",
|
|
57 | 57 | "def compute_mean_std(dataset):\n",
|
58 | 58 | " mean_std = [ (st.mean(attribute), st.stdev(attribute))\n",
|
59 | 59 | "    for attribute in zip(*dataset)]; #zip(*dataset) transposes a matrix (2-d array/list)\n",
|
60 |
| - " del mean_std[-1] # Exclude label\n", |
| 60 | + " del mean_std[-1] # Exclude label, i.e., target\n", |
61 | 61 | " return mean_std\n",
|
62 | 62 | "\n",
|
63 |
| - "def summarizeByClass(dataset):\n", |
| 63 | + "def summarizeByClass(dataset): # summary maps each class value to per-attribute (mean, std)\n", |
64 | 64 | " separated = separateByClass(dataset);\n",
|
65 | 65 | " summary = {} # to store mean and std of +ve and -ve instances\n",
|
66 | 66 | " for classValue, instances in separated.items():\n",
|
67 | 67 | "        #summary is a dictionary of tuples (mean, std) for each class value\n",
|
68 | 68 | " summary[classValue] = compute_mean_std(instances)\n",
|
69 | 69 | " return summary\n",
|
70 |
| - "#For continuous attributes p is estimated using Gaussion distribution\n", |
| 70 | + "\n", |
| 71 | + "#For continuous attributes, p is estimated using Gaussian distribution\n", |
71 | 72 | "def estimateProbability(x, mean, stdev):\n",
|
72 | 73 | " exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))\n",
|
73 | 74 | " return (1 / (math.sqrt(2*math.pi) * stdev)) * exponent\n",
|
74 | 75 | "\n",
|
75 |
| - "\n", |
| 76 | + "# calculate class probabilities of that entire row (testVector)\n", |
76 | 77 | "def calculateClassProbabilities(summaries, testVector):\n",
|
77 | 78 | " p = {}\n",
|
78 | 79 | " #class and attribute information as mean and sd\n",
|
79 | 80 | " for classValue, classSummaries in summaries.items():\n",
|
80 | 81 | " p[classValue] = 1\n",
|
81 | 82 | " for i in range(len(classSummaries)):\n",
|
82 | 83 | " mean, stdev = classSummaries[i]\n",
|
83 |
| - " x = testVector[i] #testvector's first attribute\n", |
| 84 | + " x = testVector[i] #testvector's i-th attribute\n", |
| 85 | + " \n", |
84 | 86 | " #use normal distribution\n",
|
85 |
| - " p[classValue] *= estimateProbability(x, mean, stdev);\n", |
| 87 | + " p[classValue] *= estimateProbability(x, mean, stdev)\n", |
| 88 | + " \n", |
86 | 89 | " return p\n",
|
87 | 90 | "\n",
|
| 91 | + "# calculate best out of all class probabilities of that entire row (testVector)\n", |
88 | 92 | "def predict(summaries, testVector):\n",
|
89 | 93 | " all_p = calculateClassProbabilities(summaries, testVector)\n",
|
90 | 94 | " bestLabel, bestProb = None, -1\n",
|
|
94 | 98 | " bestLabel = lbl\n",
|
95 | 99 | " return bestLabel\n",
|
96 | 100 | "\n",
|
| 101 | + "# find predicted class for each row in testSet\n", |
97 | 102 | "def perform_classification(summaries, testSet):\n",
|
98 | 103 | " predictions = []\n",
|
99 | 104 | " for i in range(len(testSet)):\n",
|
|
116 | 121 | "print(\"First Five instances of dataset:\")\n",
|
117 | 122 | "for i in range(5):\n",
|
118 | 123 | " print(i+1 , ':' , dataset[i])\n",
|
| 124 | + " \n", |
| 125 | + "\n", |
119 | 126 | "splitRatio = 0.2\n",
|
120 | 127 | "trainingSet, testSet = splitDataset(dataset, splitRatio)\n",
|
| 128 | + "\n", |
121 | 129 | "print('\\nDataset is split into training and testing set.')\n",
|
122 |
| - "print('Training examples = {0} \\nTesting examples = {1}'.format(len(trainingSet),\n", |
123 |
| - "len(testSet)))\n", |
| 130 | + "print('Training examples = {0} \\nTesting examples = {1}'.format(len(trainingSet), len(testSet)))\n", |
| 131 | + "\n", |
124 | 132 | "summaries = summarizeByClass(trainingSet);\n",
|
| 133 | + "\n", |
125 | 134 | "predictions = perform_classification(summaries, testSet)\n",
|
| 135 | + "\n", |
126 | 136 | "accuracy = getAccuracy(testSet, predictions)\n",
|
| 137 | + "\n", |
127 | 138 | "print('\\nAccuracy of the Naive Bayesian Classifier is :', accuracy)"
|
128 | 139 | ]
|
129 | 140 | },
|
|
0 commit comments