codepad
[
create a new paste
]
login
|
about
Language:
C
C++
D
Haskell
Lua
OCaml
PHP
Perl
Plain Text
Python
Ruby
Scheme
Tcl
#!/usr/bin/python
# This is a Naive Bayes Classifier script
# See http://en.wikipedia.org/wiki/Naive_Bayes_classifier
# It is limited to a single Dependant Class Attribute
# --- Released into Public Domain
# --- Written by Shawn Paul Smith

from csv import DictReader
import sys
import math

# The script was written for Python 2; this shim lets the same source run
# unchanged on Python 3, where raw_input was renamed to input.
try:
    readInput = raw_input
except NameError:
    readInput = input


def normalProb(vari, avg):
    """Return a Gaussian probability density function.

    vari -- variance of the distribution (must be non-zero)
    avg  -- mean of the distribution

    The returned callable takes one value (anything float() accepts,
    including a numeric string) and returns the density at that value.
    """
    return lambda x: (math.exp(-((float(x) - avg) ** 2) / (2 * float(vari)))
                      / math.sqrt(2 * math.pi * float(vari)))


def loadRows(path):
    """Read the CSV file at `path` and return its rows as a list of dicts."""
    # The context manager closes the file even if parsing raises.
    with open(path, "r") as csvFile:
        return list(DictReader(csvFile))


def splitByClass(rows, wanted):
    """Group `rows` by the value each row holds for the `wanted` attribute.

    Returns a dict mapping class value -> list of rows, in input order.
    """
    classes = {}
    for dataLine in rows:
        classes.setdefault(dataLine[wanted], []).append(dataLine)
    return classes


def gaussianStats(values):
    """Return (mean, sample variance) of `values` after float conversion.

    Raises ValueError when a value is not numeric, when fewer than two
    values are given (the n - 1 divisor would be zero), or when all values
    are identical (a zero variance cannot parameterise normalProb).
    """
    dataAsFloats = [float(v) for v in values]
    if len(dataAsFloats) < 2:
        raise ValueError("at least two samples per class are required")
    avg = sum(dataAsFloats) / len(dataAsFloats)
    vari = sum((x - avg) ** 2 for x in dataAsFloats) / (len(dataAsFloats) - 1)
    if vari == 0:
        raise ValueError("all samples are identical (zero variance)")
    return avg, vari


def mostLikelyClass(sampleProbFactor):
    """Return the class whose probability numerator is largest.

    sampleProbFactor -- dict mapping class value -> probability numerator
    """
    # Rank by the stored probability; iterating a dict yields only its keys,
    # so the key function has to look the value up.
    return max(sampleProbFactor, key=sampleProbFactor.get)


def main():
    """Interactively train on a CSV file and classify one sample.

    The CSV path is taken from the first command line argument and
    defaults to "genderData.csv".
    """
    path = sys.argv[1] if len(sys.argv) > 1 else "genderData.csv"
    datalist = loadRows(path)

    # DictReader files surplus fields under the key None, so a None key
    # means the row did not match the header. Row 0 is used as the
    # representative row so that a one-row file is still accepted.
    if not datalist or None in datalist[0]:
        print("Failure to read data, exiting")
        sys.exit(1)

    attributes = list(datalist[0])

    # Find the Dependant Class Attribute
    wanted = None
    while wanted not in attributes:
        wanted = readInput('What attribute is the Dependant Class Attribute: ')

    classes = splitByClass(datalist, wanted)  # Data split into classes
    probFuncs = {}    # Gaussian probability functions per class, per attribute
    numericAtts = []  # Attributes that received a Gaussian model

    for att in attributes:
        # The class attribute is the prediction target, not a feature.
        if att == wanted:
            continue
        answer = readInput('Is ' + att + ' categorical? (y/n): ')
        if answer.upper() in ['Y', 'YES']:
            print('Not Yet Implemented')
            continue
        try:
            for classType, classData in classes.items():
                avg, vari = gaussianStats([row[att] for row in classData])
                probFuncs.setdefault(classType, {})[att] = normalProb(vari, avg)
        except ValueError as err:
            print('Cannot model attribute %s: %s' % (att, err))
            sys.exit(1)
        numericAtts.append(att)

    # Holds the overall probability numerator for each class,
    # initialised with the simple class probability (the prior).
    sampleProbFactor = {}
    for classType, classData in classes.items():
        sampleProbFactor[classType] = float(len(classData)) / float(len(datalist))

    # Only attributes with a fitted model are asked for; categorical ones
    # have no probability function to evaluate.
    for att in numericAtts:
        sample = float(readInput('What is sample property -' + att + '-: '))
        for classType in classes:
            sampleProbFactor[classType] *= probFuncs[classType][att](sample)

    # Report most likely class of sample
    print(sampleProbFactor)
    print('Sample is most likely %s' % mostLikelyClass(sampleProbFactor))


if __name__ == "__main__":
    main()
Private
[
?
]
Run code
Submit