# # athletes2.py # # Code for the classifier example from chapter 5 # The dataset is a set of athletes' heights and weights # We classify athletes into gymnastics, track, and basketball # # This code contains routines for a single nearest neighbor # and a k nearest neighbor (kNN) approach # # Code file for the book Programmer's Guide to Data Mining # http://guidetodatamining.com # Ron Zacharski # from math import sqrt import random athletes = {"Shawn Johnson": [57, 90], "Li Shanshan": [57, 79], "Deng Linlin": [54, 68], "Bridget Sloan": [59, 104], "Nastia Liukin": [63, 99], "Ksenia Semenova": [54, 77], "Jennifer Lacy": [75, 175], "Shavonte Zellous": [70, 155], "Nakia Sanford": [76, 200], "Shanna Crossley": [70, 155], "Brittainey Raven": [72, 162], "Nikki Blue": [68,163], "Dita Constantina": [65, 105], "Lisa Hunter-Galvan": [66, 132], "Mara Yamauchi": [64, 112], "Blake Russell": [66, 110], "Martha Komu": [65, 115], "Jelena Prokopcuka": [66, 112]} sport = {"Shawn Johnson": "Gymnastics", "Li Shanshan": "Gymnastics", "Deng Linlin": "Gymnastics", "Bridget Sloan": "Gymnastics", "Nastia Liukin": "Gymnastics", "Ksenia Semenova": "Gymnastics", "Jennifer Lacy": "Basketball", "Shavonte Zellous": "Basketball", "Nakia Sanford": "Basketball", "Shanna Crossley": "Basketball", "Brittainey Raven": "Basketball", "Nikki Blue": "Basketball", "Dita Constantina": "Track", "Lisa Hunter-Galvan": "Track", "Mara Yamauchi": "Track", "Blake Russell": "Track", "Martha Komu": "Track", "Jelena Prokopcuka": "Track"} class classifier: def __init__(self, data = {}, category = {}): self.data = data self.category = category self.normalizeValues = {} def loadAthleteData(self, filename): f = open(filename) i = 0 for line in f: elements = line.split('\t') if len(elements) >= 5: i += 1 name = elements[0] sport = elements[1] # not using age which is elements[2] height = int(elements[3]) weight = int(elements[4]) #print("%s SPORT: %s hght: %i wt: %i" % (name, sport, height, weight)) self.data[name] = [height, weight] self.category[name] = sport else: print("SKIPPING %s" % line) print("%i entries loaded" % i) ################################################## ### ### ADDED CODE TO COMPUTE STANDARDIZED SCORE def getMedian(self, alist): if alist == []: return [] blist = sorted(alist) length = len(alist) if length % 2 == 1: # length of list is odd so return middle element return blist[int(((length + 1) / 2) - 1)] else: # length of list is even so compute midpoint v1 = blist[int(length / 2)] v2 =blist[(int(length / 2) - 1)] return (v1 + v2) / 2.0 def getAbsoluteStandardDeviation(self, alist, median): sum = 0 for item in alist: sum += abs(item - median) return sum / len(alist) def normalize(self, alist): median = self.getMedian(alist) print(median) asd = self.getAbsoluteStandardDeviation(alist, median) print(asd) for element in alist: print((element - median) / asd) def normalizeColumn(self, columnNumber): # first extract values to list col = [v[columnNumber - 1] for v in self.data.values()] print(col) median = self.getMedian(col) asd = self.getAbsoluteStandardDeviation(col, median) print("Median: %f ASD = %f" % (median, asd)) self.normalizeValues[columnNumber - 1] = (median, asd) for (k, v) in self.data.items(): tmp = self.data[k] #tmp[columnNumber] = (v - median) / asd self.data[k][columnNumber - 1] = (v[columnNumber - 1] - median) / asd def normalizeInstance(self, instanceVector): result = [] for i in range(len(instanceVector)): if i in self.normalizeValues: (median, asd) = self.normalizeValues[i] result.append((instanceVector[i] - median) / asd) else: result.append(instanceVector[i]) return result def manhattan(self, vector1, vector2): """Computes the Manhattan distance.""" distance = 0 total = 0 n = len(vector1) for i in range(n): distance += abs(vector1[i] - vector2[i]) total += 1 return distance def computeNearestNeighbor(self, itemName, itemVector): """creates a sorted list of items based on their distance to item""" distances = [] for otherItem in self.data: if otherItem != itemName: distance = self.manhattan(itemVector, self.data[otherItem]) distances.append((distance, otherItem)) # sort based on distance -- closest first distances.sort() return distances def classify(self, itemName, itemVector): """Classify the itemName based on user ratings Should really have items and users as parameters""" # first find nearest neighbor nearest = self.computeNearestNeighbor(itemName, self.normalizeInstance(itemVector))[0][1] rating = self.category[nearest] #print(itemVector) #print("Closest to %s" % nearest) #print("%s: %s" % (itemName, rating)) return rating def kNN(self, itemName, itemVector, k): # first find nearest neighbor tmp = {} resultSet = self.computeNearestNeighbor(itemName, self.normalizeInstance(itemVector))[:k] # the resultSet now contains the k nearest neighbors for res in resultSet: classification = self.category[res[1]] if classification in tmp: tmp[classification] += 1 else: tmp[classification] = 1 recommendations = list(tmp.items()) # recommendations is a list of classes and how many times # that class appeared in the nearest neighbor list (votes) # i.e. [['gymnastics', 2], ['track', 1]] recommendations.sort(key=lambda artistTuple: artistTuple[1], reverse = True) # construct list of classes that have the largest number of votes topRecommendations = list (filter(lambda k: k[1] == recommendations[0][1], recommendations)) # if only one class has the highest number of votes return that class if len(topRecommendations) == 1: rating = topRecommendations[0][0] else: rint = random.randint(0, len(topRecommendations) - 1) rating = topRecommendations[rint][0] return rating def eval(self, filename): """evaluate test set data""" f = open(filename) total = 0 correct = 0 for line in f: elements = line.split('\t') if len(elements) >= 5: total += 1 name = elements[0] sport = elements[1] # not using age which is elements[2] height = int(elements[3]) weight = int(elements[4]) classification = self.classify(name, (height, weight)) if classification == sport: print("%s CORRECT" % name) correct += 1 else: print("%s MISCLASSIFIED AS %s. Should be %s" % (name, classification, sport)) print("%f correct" % (correct / total)) def evalKNN(self, filename): """evaluate test set data""" f = open(filename) total = 0 correct = 0 for line in f: elements = line.split('\t') if len(elements) >= 5: total += 1 name = elements[0] sport = elements[1] # not using age which is elements[2] height = int(elements[3]) weight = int(elements[4]) #print("%s SPORT: %s hght: %i wt: %i" % (name, sport, height, weight)) classification = self.kNN(name, (height, weight), 3) if classification == sport: print("%s CORRECT" % name) correct += 1 else: print("%s MISCLASSIFIED AS %s. Should be %s" % (name, classification, sport)) print("%f correct" % (correct / total))