[ create a new paste ] login | about

Link: http://codepad.org/8OkD5AGM    [ raw code | fork ]

sub - Python, pasted on Dec 14:
#!/usr/bin/env python

import re
from time import sleep
from collections import defaultdict

import tweepy
from booksuggest import incdocs,update_corpus

from settings import username,password

# Accepts a token only when it consists entirely of word characters (\w)
# and is at least four characters long; a token carrying any other
# character (e.g. trailing punctuation) does not match.
WORD_REGEX = re.compile(r"^([\w]{4,})$")

class StreamWatcherListener(tweepy.StreamListener):
    """Stream listener that passes each status' term counts to the corpus."""

    def on_status(self, status):
        """Count the terms of ``status`` and record them if any were found."""
        counts = count_terms(status)
        if not counts:
            # Nothing qualified: skip update_corpus() and incdocs() entirely.
            return

        print(repr(list(counts.keys())))
        update_corpus(counts)
        incdocs()

    def on_error(self, status_code):
        """Print the reported status code and return True."""
        message = 'An error has occured! Status code = %s' % status_code
        print(message)
        return True  # keep stream alive

    def on_timeout(self):
        """Print a notice when the stream times out; returns None."""
        print('Snoozing Zzzzzz')


def count_terms(doc):
    """Tally the qualifying whitespace-separated terms of ``doc.text``.

    A term qualifies when ``WORD_REGEX`` matches it, and it is counted under
    its lower-cased form.

    Returns a ``defaultdict`` mapping term -> count whose missing keys
    default to ``0.0``. The mapping is empty when ``doc`` is falsy.
    """
    tally = defaultdict(float)  # float() is 0.0, so counts are floats

    if doc:
        for token in doc.text.split():
            # TODO: Stemming?
            if WORD_REGEX.match(token):
                tally[token.lower()] += 1

    return tally


# Build the stream with the credentials imported from settings; events are
# delivered to a fresh StreamWatcherListener.
stream = tweepy.Stream(
    username,
    password,
    StreamWatcherListener(),
    timeout=15.0,
    retry_time=15.0,
    retry_count=100
)

print('Streaming timelines...')
stream.sample()


Create a new paste based on this one


Comments: