#!/usr/bin/python
# This is a Naive Bayes Classifier script
# See http://en.wikipedia.org/wiki/Naive_Bayes_classifier
# It is limited to a single Dependent Class Attribute
# --- Released into Public Domain
# --- Written by Shawn Paul Smith
from csv import DictReader
import sys
import math
def normalProb(vari, avg):
    """Build a normal (Gaussian) probability density function.

    vari -- variance of the distribution (converted with float() on use)
    avg  -- mean of the distribution

    Returns a one-argument callable that converts its argument with
    float() and returns the density of the normal distribution at that
    point.
    """
    def density(x):
        deviation = float(x) - avg
        exponent = -(deviation ** 2) / (2 * float(vari))
        numerator = math.exp(exponent)
        normaliser = math.sqrt(2 * math.pi * float(vari))
        return numerator / normaliser
    return density
# Load every row of the CSV named on the command line, falling back to
# genderData.csv when no path is given.
csvPath = sys.argv[1] if len(sys.argv) > 1 else "genderData.csv"
# The with-block closes the file even if parsing raises.
with open(csvPath, "r") as csvFile:
    datalist = list(DictReader(csvFile))

# DictReader stores surplus fields under the key None and pads missing
# fields with the value None, so either one marks a row that does not match
# the header. Every row is checked, starting from the first (index 0).
if not datalist or any(None in row or None in row.values() for row in datalist):
    print("Failure to read data, exiting")
    # Exit with a non-zero status so a calling shell can detect the failure.
    sys.exit(1)
classes = {}    # Rows grouped by their value of the class attribute
probFuncs = {}  # Per class: attribute name -> Gaussian probability function
wanted = None   # Name of the dependent class attribute

# Keep asking until the answer names a column of the data. The first row
# (index 0) is used for the lookup so a single-row file does not raise
# IndexError.
while wanted not in datalist[0]:
    wanted = raw_input('What attribute is the Dependant Class Attribute: ')
# Group the rows by their value of the class attribute; setdefault creates
# the list the first time a class value is seen.
for record in datalist:
    classes.setdefault(record[wanted], []).append(record)
# Estimate, for every attribute other than the class attribute, a Gaussian
# probability function per class from that class's mean and sample variance.
for att in datalist[0].keys():
    if att == wanted:
        # The class attribute is the label being predicted, not a feature;
        # its values are class names and must not be converted to floats.
        continue
    if raw_input('Is ' + att + ' categorical? (y/n): ').upper() in ['Y', 'YES']:
        # Categorical attributes are unsupported: no probability function
        # is stored for this attribute.
        print('Not Yet Implemented')
        continue
    for classType, classData in classes.items():
        # Convert this class's values for the attribute to floats
        dataAsFloats = [float(row[att]) for row in classData]
        if len(dataAsFloats) < 2:
            # The sample variance divides by (n - 1); fail with a clear
            # message instead of a bare ZeroDivisionError.
            raise ValueError(
                'Class %r has fewer than two rows; cannot estimate the '
                'variance of %r' % (classType, att))
        # Mean and unbiased sample variance
        avg = sum(dataAsFloats) / len(dataAsFloats)
        vari = sum((x - avg) ** 2 for x in dataAsFloats) / (len(dataAsFloats) - 1)
        # Store the probability function under its class and attribute
        probFuncs.setdefault(classType, {})[att] = normalProb(vari, avg)
# Holds the overall probability numerator for each class.
sampleProbFactor = {}
for classType in classes:
    # Start from the class prior: the fraction of rows belonging to the class
    sampleProbFactor[classType] = float(len(classes[classType])) / float(len(datalist))

# Multiply in the likelihood of each sample attribute value under each class.
for att in datalist[0].keys():
    if att == wanted:
        continue
    if not all(att in probFuncs.get(classType, {}) for classType in classes):
        # No probability function was stored for this attribute (this
        # happens when it was declared categorical), so there is nothing to
        # evaluate; skip it rather than prompt and fail with KeyError.
        continue
    sample = float(raw_input('What is sample property -' + att + '-: '))
    for classType in classes:
        sampleProbFactor[classType] *= probFuncs[classType][att](sample)
# Report the per-class scores and the most likely class of the sample.
# The single-argument print(...) form prints the same text under the
# Python 2 print statement.
print(sampleProbFactor)
# Rank the classes by their computed probability factor. Iterating the dict
# yields class names, so the key function must look the score up; indexing
# the name itself would compare characters of the label instead.
print('Sample is most likely ' + str(max(sampleProbFactor, key=sampleProbFactor.get)))