#!/usr/bin/env python
import re
from time import sleep
from collections import defaultdict
import tweepy
from booksuggest import incdocs,update_corpus
from settings import username,password
# A term is kept only when the whole token consists of four or more "word"
# characters (\w); shorter tokens and tokens containing any other character
# (e.g. attached punctuation) do not match.
# NOTE(review): no re.UNICODE flag is passed, so what \w accepts for
# non-ASCII text depends on the interpreter version -- confirm this is intended.
WORD_REGEX = re.compile(
    r"^([\w]{4,})$"
)
class StreamWatcherListener(tweepy.StreamListener):
    """Stream listener that turns each incoming status into term counts.

    Statuses that yield at least one counted term are passed on to
    ``update_corpus`` and ``incdocs``; errors and timeouts are only
    reported on standard output.
    """

    def on_status(self, status):
        """Count the terms of ``status`` and record them if any were found.

        The counted terms are printed, handed to ``update_corpus`` and then
        ``incdocs`` is called. A status with no counted terms is ignored.
        Always returns None.
        """
        counts = count_terms(status)
        if not counts:
            # Nothing matched in this status: leave the corpus untouched.
            return
        print(repr(counts.keys()))
        update_corpus(counts)
        incdocs()

    def on_error(self, status_code):
        """Print ``status_code`` in an error message and return True."""
        message = 'An error has occured! Status code = %s' % status_code
        print(message)
        # keep stream alive
        return True

    def on_timeout(self):
        """Print a notice that the stream timed out; returns None."""
        print('Snoozing Zzzzzz')
def count_terms(doc):
    """Return per-term occurrence counts for the text of ``doc``.

    ``doc.text`` is split on whitespace; every token accepted by
    ``WORD_REGEX`` is lowercased and tallied.

    Args:
        doc: object exposing a ``text`` string attribute. A falsy ``doc``
            (e.g. ``None``) produces an empty result.

    Returns:
        A ``defaultdict`` mapping lowercase term -> count. Counts are floats
        and the default for an unseen term is ``0.0``.
    """
    tally = defaultdict(float)
    if doc:
        # TODO: Stemming?
        accepted = (
            token.lower()
            for token in doc.text.split()
            if WORD_REGEX.match(token)
        )
        for word in accepted:
            tally[word] += 1
    return tally
# Script entry: build the stream from the credentials in ``settings`` and a
# fresh StreamWatcherListener, then start consuming the sample stream.
# NOTE(review): timeout / retry_time are presumably seconds and retry_count a
# maximum number of reconnect attempts -- confirm against the installed
# tweepy version.
stream = tweepy.Stream(
    username,
    password,
    StreamWatcherListener(),
    timeout=15.0,
    retry_time=15.0,
    retry_count=100,
)
print('Streaming timelines...')
stream.sample()