# coding:utf-8
"""
Spelling corrector library, used to correct common typos in domains like
gmal.com instead of gmail.com.

The spelling corrector uses difflib which in turn uses the
Ratcliff-Obershelp algorithm [1] to compute the similarity of two strings.
This is a very fast and accurate algorithm for domain spelling correction.

The (only) public function this module has is suggest(word), which, given
a domain, suggests an alternative or returns the original domain
if no suggestion exists.

[1] http://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html
"""

import difflib


def suggest(word, cutoff=0.77):
    """
    Return the most likely intended domain for ``word``.

    Names listed in LOOKUP_TABLE are mapped straight to their full domain.
    Anything else is compared against MOST_COMMON_DOMAINS with difflib, and
    the single best candidate whose similarity is at least ``cutoff`` is
    returned. When nothing qualifies, ``word`` is handed back unchanged.
    """
    # Exact table hits take priority over fuzzy matching.
    try:
        return LOOKUP_TABLE[word]
    except KeyError:
        pass

    candidates = difflib.get_close_matches(
        word, MOST_COMMON_DOMAINS, n=1, cutoff=cutoff)
    # get_close_matches returns a (possibly empty) list, best match first.
    return candidates[0] if candidates else word

# Candidate domains that suggest() fuzzy-matches a (possibly misspelled)
# domain against. Apart from the leading mailgun.net entry, the entries are
# currently listed in ascending string order.
MOST_COMMON_DOMAINS = [
    # mailgun :)
    'mailgun.net',
    # The top ~200 domains from 30mm open events gathered during the
    # 30 days prior to 10/31/2018.
    '163.com',
    'abv.bg',
    'account.tradeshift.com',
    'aim.com',
    'albany.edu',
    'alice.it',
    'amerihome.com',
    'andrew.cmu.edu',
    'aol.co.uk',
    'aol.com',
    'aol.de',
    'aol.fr',
    'arcor.de',
    'artifacturestudios.com',
    'asu.edu',
    'atproperties.com',
    'att.net',
    'austin.utexas.edu',
    'austincc.edu',
    'baylor.edu',
    'bellsouth.net',
    'berkeley.edu',
    'bigpond.com',
    'bigpond.net.au',
    'binghamton.edu',
    'bk.ru',
    'bluewin.ch',
    'blueyonder.co.uk',
    'bol.com.br',
    'btinternet.com',
    'bu.edu',
    'byui.edu',
    'calpoly.edu',
    'charter.net',
    'cloud.upwork.com',
    'colemanrg.com',
    'colorado.edu',
    'columbia.edu',
    'comcast.net',
    'compass.com',
    'cornell.edu',
    'cox.net',
    'coyote.csusb.edu',
    'cpp.edu',
    'crimson.ua.edu',
    'cytonn.com',
    'docomo.ne.jp',
    'du.edu',
    'earthlink.net',
    'email.arizona.edu',
    'email.sc.edu',
    'embarqmail.com',
    'emory.edu',
    'ezweb.ne.jp',
    'fiu.edu',
    'free.fr',
    'freenet.de',
    'frontier.com',
    'g.austincc.edu',
    'gmail.com',
    'gmx.at',
    'gmx.de',
    'gmx.net',
    'google.com',
    'googlemail.com',
    'guest.booking.com',
    'gwu.edu',
    'hawk.iit.edu',
    'home.nl',
    'hotmail.ca',
    'hotmail.co.uk',
    'hotmail.com',
    'hotmail.de',
    'hotmail.es',
    'hotmail.fr',
    'hotmail.it',
    'hotmail.se',
    'i.softbank.jp',
    'icloud.com',
    'iinet.net.au',
    'illinois.edu',
    'inbox.ru',
    'jhu.edu',
    'juno.com',
    'knights.ucf.edu',
    'kw.com',
    'laposte.net',
    'libero.it',
    'list.ru',
    'live.ca',
    'live.co.uk',
    'live.com',
    'live.com.au',
    'live.fr',
    'live.nl',
    'live.se',
    'lsu.edu',
    'mac.com',
    'mail.com',
    'mail.ru',
    'mail.usf.edu',
    'marketplace.amazon.co.uk',
    'marketplace.amazon.com',
    'marketplace.amazon.de',
    'masonlive.gmu.edu',
    'mavs.uta.edu',
    'me.com',
    'miami.edu',
    'msn.com',
    'msu.edu',
    'my.fsu.edu',
    'naver.com',
    'ntlworld.com',
    'ohio.edu',
    'online.no',
    'optonline.net',
    'optusnet.com.au',
    'orange.fr',
    'osu.edu',
    'outlook.com',
    'outlook.de',
    'outlook.es',
    'outlook.fr',
    'pace.edu',
    'pegipegi.com',
    'pitt.edu',
    'protonmail.com',
    'q.com',
    'qq.com',
    'rambler.ru',
    'rev.com',
    'roadrunner.com',
    'rocketmail.com',
    'rogers.com',
    'rollins.edu',
    'rutgers.edu',
    'savaari.com',
    'sbcglobal.net',
    'seznam.cz',
    'sfr.fr',
    'shaw.ca',
    'sky.com',
    'skynet.be',
    'spartans.ut.edu',
    'stanford.edu',
    'stjohns.edu',
    'stonybrook.edu',
    'student.gsu.edu',
    'suddenlink.net',
    'sympatico.ca',
    'syr.edu',
    't-online.de',
    'talktalk.net',
    'telenet.be',
    'telia.com',
    'telus.net',
    'temple.edu',
    'topper.wku.edu',
    'transfix.io',
    'twc.com',
    'txstate.edu',
    'u.northwestern.edu',
    'uci.edu',
    'ucr.edu',
    'ucsd.edu',
    'udel.edu',
    'uga.edu',
    'umail.ucsb.edu',
    'umich.edu',
    'umn.edu',
    'uol.com.br',
    'utexas.edu',
    'uw.edu',
    'uwm.edu',
    'vepl.com',
    'verizon.net',
    'videotron.ca',
    'virginia.edu',
    'vt.edu',
    'wanadoo.fr',
    'wayne.edu',
    'web.de',
    'wildcats.unh.edu',
    'windstream.net',
    'wisc.edu',
    'wp.pl',
    'xtra.co.nz',
    'yahoo.ca',
    'yahoo.co.in',
    'yahoo.co.jp',
    'yahoo.co.uk',
    'yahoo.com',
    'yahoo.com.ar',
    'yahoo.com.au',
    'yahoo.com.br',
    'yahoo.com.hk',
    'yahoo.com.mx',
    'yahoo.com.sg',
    'yahoo.com.tw',
    'yahoo.de',
    'yahoo.es',
    'yahoo.fr',
    'yahoo.it',
    'yandex.ru',
    'ymail.com',
    'ziggo.nl'
]

# Inputs the fuzzy corrector does not fix on its own but that should still be
# corrected. suggest() consults this table before any fuzzy matching and
# returns the mapped domain directly.
LOOKUP_TABLE = {
    u'yahoo': u'yahoo.com',
    u'gmail': u'gmail.com',
    u'hotmail': u'hotmail.com',
    u'live': u'live.com',
    u'outlook': u'outlook.com',
    u'msn': u'msn.com',
    u'googlemail': u'googlemail.com',
    u'aol': u'aol.com',
    u'aim': u'aim.com',
    u'icloud': u'icloud.com',
    u'me': u'me.com',
    u'mac': u'mac.com',
    u'facebook': u'facebook.com',
    u'comcast': u'comcast.net',
    u'sbcglobal': u'sbcglobal.net',
    u'bellsouth': u'bellsouth.net',
    u'verizon': u'verizon.net',
    u'earthlink': u'earthlink.net',
    u'cox': u'cox.net',
    u'charter': u'charter.net',
    u'shaw': u'shaw.ca',
    u'bell': u'bell.net',
}
