Repository: anfederico/Stockeye
Branch: master
Commit: 6f62a4ca431c
Files: 8
Total size: 15.2 KB

Directory structure:
gitextract_pw5dpfc3/

├── .travis.yml
├── README.md
├── bin/
│   └── stockeye-corpus
├── requirements.txt
└── stockeye/
    ├── __init__.py
    ├── symbols/
    │   ├── alpha.txt
    │   └── clean.txt
    └── watch.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .travis.yml
================================================
language: python
python:
    - "2.7"

before_install: sudo apt-get install -qq python-numpy python-scipy
install: pip install -r requirements.txt

script: echo "TODO"


================================================
FILE: README.md
================================================
## Deprecated

This project is no longer maintained.

================================================
FILE: bin/stockeye-corpus
================================================
#!/bin/sh
# Download the NLTK data sets requested below: "stopwords" and "punkt".
# stockeye/watch.py loads nltk.corpus.stopwords at import time and uses the
# nltk.tokenize helpers (presumably backed by "punkt" - confirm).
#
# The shebang lets the script be executed directly; `set -e` stops at the
# first failed download instead of reporting only the last command's status.
set -e

python -m nltk.downloader stopwords
python -m nltk.downloader punkt

================================================
FILE: requirements.txt
================================================
beautifulsoup4>=4.3.2
bs4>=0.0.1
DateTime>=4.1.1
email>=4.0.2
lxml>=3.3.5
newspaper>=0.0.9.8
nltk>=3.2.2
numpy>=1.8.2
requests>=2.3.0

================================================
FILE: stockeye/__init__.py
================================================
from watch import watch


================================================
FILE: stockeye/symbols/alpha.txt
================================================
AfterHoursChangeRealtime
AnnualizedGain
Ask
AskRealtime
AverageDailyVolume
Bid
BidRealtime
BookValue
Change
ChangeFromFiftydayMovingAverage
ChangeFromTwoHundreddayMovingAverage
ChangeFromYearHigh
ChangeFromYearLow
ChangePercentRealtime
ChangeRealtime
ChangeinPercent
Commission
Currency
DaysHigh
DaysLow
DaysRange
DaysRangeRealtime
DaysValueChange
DaysValueChangeRealtime
DividendPayDate
DividendShare
DividendYield
EBITDA
EPSEstimateCurrentYear
EPSEstimateNextQuarter
EPSEstimateNextYear
EarningsShare
ExDividendDate
FiftydayMovingAverage
HighLimit
HoldingsGain
HoldingsGainPercent
HoldingsGainPercentRealtime
HoldingsGainRealtime
HoldingsValue
HoldingsValueRealtime
LastTradeDate
LastTradePriceOnly
LastTradeRealtimeWithTime
LastTradeTime
LastTradeWithTime
LowLimit
MarketCapRealtime
MarketCapitalization
MoreInfo
Name
Notes
OneyrTargetPrice
Open
OrderBookRealtime
PEGRatio
PERatio
PERatioRealtime
PercentChangeFromYearHigh
PercentChange
PercentChangeFromFiftydayMovingAverage
PercentChangeFromTwoHundreddayMovingAverage
PercentChangeFromYearLow
PreviousClose
PriceBook
PriceEPSEstimateCurrentYear
PriceEPSEstimateNextYear
PricePaid
PriceSales
SharesOwned
ShortRatio
StockExchange
Symbol
TickerTrend
TradeDate
TwoHundreddayMovingAverage
Volume
YearHigh
YearLow
YearRange

================================================
FILE: stockeye/symbols/clean.txt
================================================
After Hours Change Realtime
Annualized Gain
Ask
Ask Realtime
Average Daily Volume
Bid
Bid Realtime
Book Value
Change
Change From Fifty Day Moving Average
Change From Two Hundred Day Moving Average
Change From Year High
Change From Year Low
Change Percent Realtime
Change Realtime
Change in Percent
Commission
Currency
Days High
Days Low
Days Range
Days Range Realtime
Days Value Change
Days Value Change Realtime
Dividend Pay Date
Dividend Share
Dividend Yield
EBITDA
EPS Estimate Current Year
EPS Estimate Next Quarter
EPS Estimate Next Year
Earnings Share
Ex Dividend Date
Fifty Day Moving Average
High Limit
Holdings Gain
Holdings Gain Percent
Holdings Gain Percent Realtime
Holdings Gain Realtime
Holdings Value
Holdings Value Realtime
Last Trade Date
Last Trade Price Only
Last Trade Realtime With Time
Last Trade Time
Last Trade With Time
Low Limit
Market Cap Realtime
Market Capitalization
More Info
Name
Notes
Oneyr Target Price
Open
Order Book Realtime
PEG Ratio
PE Ratio
PE Ratio Realtime
Percent Change From Year High
Percent Change
Percent Change From Fifty Day Moving Average
Percent Change From Two Hundred Day Moving Average
Percent Change From Year Low
Previous Close
Price Book
Price EPS Estimate Current Year
Price EPS Estimate Next Year
Price Paid
Price Sales
Shares Owned
Short Ratio
Stock Exchange
Symbol
Ticker Trend
Trade Date
Two Hundred Day Moving Average
Volume
Year High
Year Low
Year Range

================================================
FILE: stockeye/watch.py
================================================
from requests  import get
from time      import sleep
from random    import randint
from newspaper import Article
from bs4       import BeautifulSoup
from re        import search, sub  
from datetime  import datetime, timedelta
from math      import log10 

from smtplib              import SMTP
from email.mime.multipart import MIMEMultipart
from email.mime.text      import MIMEText
from nltk.tokenize        import sent_tokenize, word_tokenize
from nltk.corpus          import stopwords
# English stop words, loaded once when the module is imported. overlap()
# drops these words from both sentences before comparing them.
# NOTE(review): this needs the NLTK "stopwords" corpus to be available at
# import time (bin/stockeye-corpus downloads it).
stopWords = set(stopwords.words('english'))

# --- Textrank Methods ---------------------------------------------------------

class vertex:
    """One sentence of a text, used as a node when ranking sentences.

    The class attribute ``order`` is a running counter shared by all
    instances: every new vertex records the current value as its own
    ``order`` and then advances the counter, so ``order`` reflects the
    sequence in which vertices were created.
    """
    order = 0

    def __init__(self, sentence_raw, sentence_processed, words):
        # The sentence in its three forms: original, cleaned, tokenised.
        self.words              = words
        self.sentence_processed = sentence_processed
        self.sentence_raw       = sentence_raw
        # Scoring state: per-neighbour scores and their average (unset yet).
        self.scores             = []
        self.score              = None
        # Take the next sequence number from the shared counter.
        self.order              = vertex.order
        vertex.order            = vertex.order + 1

    def averageScores(self):
        """Store the mean of ``scores`` in ``score`` (0 when there are none)."""
        if len(self.scores) == 0:
            self.score = 0
        else:
            self.score = sum(self.scores)/len(self.scores)

def overlap(w1, w2):
    """Score how strongly two tokenised sentences overlap.

    Words found in the module-level ``stopWords`` set are dropped from both
    lists. The score is the number of remaining words of ``w1`` that also
    occur in ``w2``, divided by ``log10(len(s1)) + log10(len(s2))`` where
    ``s1``/``s2`` are the filtered lists.

    Args:
        w1: words of the first sentence.
        w2: words of the second sentence.

    Returns:
        The overlap score, or 0 when it cannot be computed: either filtered
        list is empty (log10(0) is undefined) or both hold exactly one word
        (the denominator is 0).
    """
    s1 = [w for w in w1 if w not in stopWords]
    s2 = [w for w in w2 if w not in stopWords]

    # log10(0) raises ValueError rather than ZeroDivisionError, so an empty
    # side has to be rejected explicitly before taking logarithms.
    if not s1 or not s2:
        return 0

    norm = log10(len(s1)) + log10(len(s2))
    if norm == 0:
        return 0

    # Set membership keeps the count linear in the sentence lengths.
    lookup = set(s2)
    shared = sum(1 for w in s1 if w in lookup)
    return shared/norm
    
def buildGraph(text):
    """Turn ``text`` into a list of scored sentence vertices.

    The text is split into sentences; each sentence is stripped of every
    character other than ASCII letters and spaces, lower-cased and tokenised,
    and wrapped in a ``vertex``. Every vertex is then scored against every
    other vertex with ``overlap`` and its average score is stored.

    Returns:
        list of vertex objects, in sentence order.
    """
    vertices = []
    for raw in sent_tokenize(text, language='english'):
        cleaned = sub("[^a-zA-Z ]+", '', raw).lower()
        tokens = word_tokenize(cleaned, language='english')
        vertices.append(vertex(raw, cleaned, tokens))

    for node in vertices:
        # Compare against every other sentence (never against itself).
        node.scores.extend(
            overlap(node.words, other.words)
            for other in vertices
            if other.order != node.order
        )
        node.averageScores()
    return vertices

def summarize(text, length, firstlast = False):
    """Pick the highest-scoring sentences of ``text`` as a summary.

    Args:
        text: the text to summarise.
        length: number of top-scoring sentences to keep.
        firstlast: when true, the first and last sentences of the text are
            added to the summary if they were not selected already.

    Returns:
        list of the selected sentences (original wording), in the order they
        appear in the text. Empty when the text contains no sentences.
    """
    vertices = buildGraph(text)
    in_order = sorted(vertices, key=lambda v: v.order)
    top = sorted(vertices, key=lambda v: v.score, reverse=True)[:length]
    chosen = sorted(top, key=lambda v: v.order)

    # A text without sentences yields no vertices; indexing the first/last
    # one would raise IndexError, so only add them when there is something.
    if firstlast and in_order:
        first, last = in_order[0], in_order[-1]
        if first not in chosen:
            chosen.insert(0, first)
        if last not in chosen:
            chosen.append(last)

    return [v.sentence_raw for v in chosen]

# --- Yahoo Methods -----------------------------------------------------------

def loadSymbols():
    """Map each property key to its human-readable label.

    Reads ``symbols/alpha.txt`` (keys) and ``symbols/clean.txt`` (labels)
    from the directory that contains this module and pairs them line by
    line. Resolving the path from the module location makes the lookup
    independent of the process's working directory.

    Returns:
        dict mapping key -> label.

    Raises:
        IOError/OSError: if either file cannot be opened.
        IndexError: if clean.txt has fewer lines than alpha.txt.
    """
    import os  # local import: only this function needs it

    base = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'symbols')

    # `with` guarantees both handles are closed, even on a read error.
    with open(os.path.join(base, 'alpha.txt'), 'r') as afile:
        alpha = [line.strip('\n') for line in afile]
    with open(os.path.join(base, 'clean.txt'), 'r') as cfile:
        clean = [line.strip('\n') for line in cfile]

    # zip() would silently truncate; keep a missing label an explicit error.
    if len(clean) < len(alpha):
        raise IndexError('symbols/clean.txt has fewer lines than symbols/alpha.txt')

    return dict(zip(alpha, clean))

def yahooURL(ticks):
    """Build the YQL quote-query URL for the given ticker symbols.

    Each ticker is wrapped in URL-encoded double quotes (%22) and the
    tickers are separated by URL-encoded commas (%2C) inside the query's
    ``in (...)`` clause.
    """
    quoted = ['%22'+t+'%22' for t in ticks]
    query = '%2C'.join(quoted)
    return "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20("+query+")%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env&callback="

def yahooRequest(url, moreProperties = []):
    """Fetch quotes from ``url`` and keep only the requested properties.

    Args:
        url: query URL, as built by yahooURL().
        moreProperties: property names to collect in addition to 'Name'.
            The default list is never modified.

    Returns:
        dict mapping each quote's 'Symbol' to a dict of property -> value,
        where a property missing from the quote is stored as the string
        "None". Returns None (after printing a notice) when the response
        holds no quotes.
    """
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'}
    payload = get(url, headers=headers).json()
    try:
        quotes = payload['query']['results']['quote']
    except (KeyError, TypeError):
        # KeyError: a level of the path is missing. TypeError: a level is
        # null/None, which cannot be subscripted. Both mean "no quotes".
        print("No Stocks Found!")
        return

    # A single match arrives as one dict, several matches as a list of them;
    # normalise so both cases share the same loop.
    if isinstance(quotes, dict):
        quotes = [quotes]

    properties = ['Name'] + list(moreProperties)
    stocks = {}
    for q in quotes:
        stocks[q['Symbol']] = dict((p, q.get(p, "None")) for p in properties)
    return stocks

# --- Email Methods ------------------------------------------------------------

def stats_HTML(symbol, statistics, properties):
    """Render the statistics table that opens the e-mail body.

    Args:
        symbol: ticker symbol; used as the heading and as the key into
            ``statistics``.
        statistics: dict of symbol -> {property: value}.
        properties: property names to show, one table row each.

    Returns:
        HTML string: centred heading, table of label/value rows, and a rule.

    Raises:
        KeyError: if ``statistics`` has no value for a requested property.
    """
    symbols = loadSymbols()
    rows = []
    for p in properties:
        try:
            label_cell = '<tr><td style="padding-right:30px">'+symbols[p]+'</td>'
        except KeyError:
            # No readable label is known for this property: show the raw
            # name. Only the missing-key case is handled so that unrelated
            # errors (and Ctrl-C) are no longer swallowed.
            label_cell = '<tr><td>'+p+'</td>'
        rows.append(label_cell+'<td>'+str(statistics[symbol][p])+'</td></tr>')
    return '<center><b>'+symbol+'</b><br><br><table>'+''.join(rows)+'</table><br><hr><br></center>'

def outline_HTML(i, title, link, time, summary):
    """Render one article entry of the e-mail body.

    Produces a numbered, linked title (numbering starts at ``i + 1``), a
    "Posted <time>" line, and each summary sentence in italics.
    """
    heading = ''.join(['<br>', str(i+1), '. <b><a href="', link, '">', title, '</a></b><br>'])
    posted = ''.join(['Posted ', time, '<br>'])
    sentences = ''.join(['<br><i>'+sentence+'<br></i>' for sentence in summary])
    return heading+posted+sentences

def subject_HTML(symbol):
    """Return the e-mail subject line for ``symbol``."""
    return 'Recent News Activity for ' + symbol

def body_HTML(symbol, statistics, properties, articles):
    """Assemble the full e-mail body.

    The statistics table comes first, followed by one outline per article in
    the order given.
    """
    sections = [stats_HTML(symbol, statistics, properties)]
    for position, item in enumerate(articles):
        sections.append(outline_HTML(position, item.title, item.link, item.time, item.summary))
    return ''.join(sections)

def sendEmail(subject, body, credentials):
    """Send an HTML e-mail through smtp.gmail.com (port 587, STARTTLS).

    Args:
        subject: subject line.
        body: HTML body, sent as UTF-8.
        credentials: sequence whose first four items are the SMTP login
            name, the password, the From address and the To address.
    """
    from smtplib import SMTPException  # local import: only used for cleanup

    username, password = credentials[0], credentials[1]
    fromAddr, toAddr = credentials[2], credentials[3]

    msg = MIMEMultipart()
    msg['From'] = fromAddr
    msg['To'] = toAddr
    msg['Subject'] = subject
    msg.attach(MIMEText(body, 'html', 'UTF-8'))

    server = SMTP('smtp.gmail.com', 587)
    try:
        server.starttls()
        server.login(username, password)
        server.sendmail(fromAddr, toAddr, msg.as_string())
    finally:
        # Always release the connection, including when login or sending
        # fails. If the polite QUIT itself fails (e.g. the server already
        # dropped the connection), just close the socket so that the
        # original error is the one the caller sees.
        try:
            server.quit()
        except SMTPException:
            server.close()

# --- Scraping Methods ---------------------------------------------------------

class article:
    """A news search result plus the text extracted from it.

    Attributes set here: ``title``, ``link`` and ``time`` as given;
    ``order`` starts as None; ``body`` and ``summary`` start as empty lists.

    The print helpers use single-argument ``print(...)`` calls, which produce
    the same output under Python 2.7's print statement and Python 3's print
    function.
    """
    def __init__(self, title, link, time):
        self.title   = title
        self.link    = link
        self.time    = time
        self.order   = None
        self.body    = []
        self.summary = []

    def printTitle(self):
        """Print the title on its own line."""
        print(self.title)

    def printBody(self):
        """Print each body paragraph indented by three spaces, then a blank line."""
        for s in self.body:
            # Three spaces: the two-space prefix plus the separator space
            # that the former `print '  ',` / `print s` pair emitted.
            print('   ' + s)
            print('')

    def printSummary(self):
        """Print each summary sentence followed by a blank line."""
        for s in self.summary:
            print(s)
            print('')

def similarity(s1, s2):
    """Return how alike two sequences are, as a percentage.

    The Levenshtein edit distance between ``s1`` and ``s2`` is computed with
    a two-row dynamic-programming table and normalised by the combined
    length: ``100 - distance / (len(s1) + len(s2)) * 100``.

    Identical inputs give 100.0 (two empty inputs count as identical). When
    exactly one input is empty the result is 0.0: the distance equals the
    other input's length, so the same formula applies. Empty inputs used to
    return a raw length here, which callers comparing against a percentage
    threshold could misread as a high similarity.

    Returns:
        float percentage.
    """
    total = len(s1) + len(s2)
    if total == 0:
        # Nothing to compare and nothing to divide by: treat as identical.
        return 100.0

    # previous[j] is the distance between the processed prefix of s1 and
    # the first j items of s2; it starts as the distances from an empty s1.
    previous = list(range(len(s2) + 1))
    for i, c1 in enumerate(s1):
        current = [i + 1]
        for j, c2 in enumerate(s2):
            cost = 0 if c1 == c2 else 1
            current.append(min(current[j] + 1,        # insertion
                               previous[j + 1] + 1,   # deletion
                               previous[j] + cost))   # substitution / match
        previous = current

    return 100 - (float(previous[-1])/total)*100

def unique(title, articles):
    """Tell whether ``title`` is new compared with the collected articles.

    Returns False as soon as an article's title scores 95 or more against
    ``title`` with similarity(); True otherwise (including for no articles).
    """
    return not any(similarity(title, known.title) >= 95 for known in articles)

def createURLs(query, pages):
    """Build one news-search URL per result page.

    Args:
        query: search phrase; it is lower-cased and its spaces become '+'.
        pages: number of result pages; page k uses ``start=10*k``.

    Returns:
        list of URL strings, one per page (empty when ``pages`` is 0).
    """
    # NOTE(review): only spaces are encoded; characters such as '&' or '"'
    # in the query are inserted into the URL unchanged.
    lower = query.lower().replace(' ', '+')
    template = 'https://www.google.com/search?q="%s"&tbm=nws&tbs=qdr:y#q="%s"&safe=active&tbs=qdr:y,sbd:1&tbm=nws&start=%s'
    # range() exists on Python 2 and 3 alike (xrange is Python 2 only).
    return [template % (lower, lower, 10 * page) for page in range(pages)]
  
def _resultTime(anchor):
    """Return the text of the "_uQb" span near a result link, or None.

    Looks inside the link's parent element first, then its grandparent.
    """
    container = anchor.parent
    for _ in range(2):
        if container is None:
            return None
        span = container.find("span", class_="_uQb")
        if span is not None:
            return span.text
        container = container.parent
    return None

def grabArticles(query, pages, rest = 0):
    """Scrape news search results for ``query``.

    Args:
        query: search phrase, passed to createURLs().
        pages: number of result pages to fetch.
        rest: upper bound, in whole seconds, of the random pause taken after
            each page; the pause is drawn from ``rest // 2`` to ``rest``.

    Returns:
        list of article objects (title, link, time) whose titles are not
        near-duplicates of one another, in the order they were found.
    """
    urls = createURLs(query, pages)
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'}
    articles = []
    for url in urls:
        response = get(url, headers=headers)
        soup = BeautifulSoup(response.text, "html.parser")
        # NOTE(review): "l._HId" is passed as one class name; if the intent
        # was anchors carrying both "l" and "_HId", this filter may never
        # match - confirm against the page markup.
        anchors = soup.find_all("a", class_="l._HId") + soup.find_all("a", class_="_sQb")

        for a in anchors:
            title = a.get_text()
            link = a['href']
            time = _resultTime(a)
            if time is None:
                # No timestamp next to this link: it could not be ordered
                # later on, so skip it instead of failing the whole run.
                continue
            if unique(title, articles):
                articles.append(article(title, link, time))

        # Integer bounds: randint() rejects non-integral values, which the
        # former float(rest)/2 produced for every odd `rest`.
        sleep(randint(int(rest) // 2, int(rest)))
    return articles

# ----- Analytical Methods -----------------------------------------------------

def summarizeArticles(articles, length, firstlast = False):
    """Download each article and attach its body and summary.

    For every article the page behind ``link`` is downloaded and parsed;
    paragraphs longer than 100 characters are appended to ``body`` and
    joined into the text given to summarize(); the resulting sentences are
    appended to ``summary``.

    Args:
        articles: article objects to process (modified in place).
        length: number of top sentences, forwarded to summarize().
        firstlast: forwarded to summarize().

    Returns:
        list of the articles that were processed without error; articles
        whose download, parsing or summarising failed are left out.
    """
    summedArticles = []
    for a in articles:
        try:
            page = Article(a.link)
            page.download()
            page.parse()
            paragraphs = [p for p in page.text.split('\n') if len(p) > 100]
            a.body.extend(paragraphs)
            text = ''.join(p + ' ' for p in paragraphs)
            a.summary.extend(summarize(text, length, firstlast))
        except Exception:
            # Deliberately best-effort: one unreachable or unparsable page
            # must not abort the run. Catching Exception (not everything)
            # lets KeyboardInterrupt and SystemExit still stop the program.
            continue
        summedArticles.append(a)
    return summedArticles

def sortArticles(articles):
    """Date every article from its ``time`` text and sort newest first.

    ``time`` is either relative - "<N> <unit> ..." where the unit contains
    "second", "minute", "hour", "day" or "week" - or an absolute date in
    '%b %d, %Y' form (e.g. "Jan 05, 2017"). The resulting datetime is stored
    in each article's ``order`` attribute.

    Returns:
        new list of the articles, most recent first.

    Raises:
        ValueError: if a ``time`` text matches neither form.
    """
    # One reference instant for the whole batch, so that articles carrying
    # the same relative stamp get exactly the same timestamp.
    now = datetime.now()
    # (word to look for in the stamp, matching timedelta keyword). "day" and
    # "week" stamps would otherwise reach strptime() and raise ValueError.
    units = (("second", "seconds"),
             ("minute", "minutes"),
             ("hour", "hours"),
             ("day", "days"),
             ("week", "weeks"))
    for a in articles:
        stamp = a.time
        for word, keyword in units:
            if word in stamp:
                amount = int(stamp.split(' ')[0])
                a.order = now - timedelta(**{keyword: amount})
                break
        else:
            a.order = datetime.strptime(stamp, '%b %d, %Y')
    return sorted(articles, key=lambda a: a.order, reverse=True)

# ----- The Mastermind ---------------------------------------------------------

def watch(credentials, ticks, properties = [], threshold = 5, hourspast = 18, sentences = 3, firstlast = False):
    """Look up news for each ticker and e-mail a digest when there is enough.

    For every symbol returned by the quote lookup, recent news is searched
    using the company name plus the symbol, the articles are summarised and
    sorted, and an e-mail is sent if at least ``threshold`` of them are no
    older than ``hourspast`` hours.

    Args:
        credentials: passed to sendEmail().
        ticks: ticker symbols (at most 100).
        properties: extra quote properties to show in the e-mail; the
            default list is never modified.
        threshold: minimum number of recent articles that triggers an
            e-mail; must be greater than 0.
        hourspast: maximum article age in hours; must not be negative.
        sentences: number of summary sentences per article.
        firstlast: forwarded to summarizeArticles().

    Returns:
        None. Invalid arguments and an empty quote lookup end the run early
        (the former after printing a notice).
    """
    if threshold <= 0:
        print("Please choose a threshold greater than 0.")
        return
    if hourspast < 0:
        print("This program is not capable of scraping news from the future.")
        return
    if len(ticks) > 100:
        print("API calls are limited to 100 individual stocks.")
        return

    estimate = len(ticks)*15*2
    if estimate < 60:
        print("This run will take approximately %s seconds" % (str(estimate)))
    else:
        # Floor division keeps the whole-minute figure on Python 2 and 3.
        print("This run will take approximately %s minutes" % (str(estimate//60)))

    stats = yahooRequest(yahooURL(ticks), properties)
    if stats is None:
        # yahooRequest() found no quotes and has already reported it.
        return

    remove = ['class', 'common', 'stock']
    for symbol in stats:
        name = stats[symbol]['Name']
        # yahooRequest() stores the string "None" for a missing property;
        # treat that placeholder like an absent name.
        if not name or name == "None":
            print("Coudn't find any company for %s" % (symbol))
            continue

        print("Finding news for %s" % (symbol))
        query = (' '.join([w for w in name.split() if w.lower() not in remove]))+' '+symbol
        articles = grabArticles(query, 2, 20)
        articles = summarizeArticles(articles, sentences, firstlast)
        articles = sortArticles(articles)

        recentArticles = []
        for a in articles:
            hoursago = float((datetime.now()-a.order).total_seconds())/3600
            if hoursago <= hourspast:
                recentArticles.append(a)
        if len(recentArticles) >= threshold:
            subject = subject_HTML(symbol)
            body = body_HTML(symbol, stats, properties, recentArticles)
            sendEmail(subject, body, credentials)