User:Jitse's bot/goim.py

# This is a bot which updates various reports of interest to
# WikiProject Mathematics on the English Wikipedia. It runs daily
# under the account [[User:Jitse's bot]].
#
# Written and run by [[User:Jitse Niesen]].
#
# This code is a mess and I know it is a mess, but I don't care very
# much. The bot only writes to three pages:
#  * [[Wikipedia:WikiProject Mathematics/Current activity]]
#  * [[Wikipedia:Pages needing attention/Mathematics/Lists]]
#  * [[Wikipedia:WikiProject Mathematics/Count]]
# Any malfunctioning will thus be contained within those three pages
# and within the maths WikiProject, which will gladly clean up any
# mess it creates ;) . That being said, if somebody is willing to
# rewrite the code, go ahead!

import datetime, os, pickle, re, sys, time
from pprint import PrettyPrinter

sys.path.append(os.getcwd())
import catlib, wikipedia

site = wikipedia.getSite()

noload = False                          # do not update data
nowrite = False                         # do not write results

data = None                             # global variable containing all data
tempdata = {}                           # data that does not need to be saved
subject = 'daily update'

class GoimData:
    """All data is stored in a global variable of this type."""
    def __init__(self):
        self.all = GoimDiff()           # all maths articles
        self.att = GoimDiff()           # articles needing attention
        self.cat = GoimDiff()           # maths categories
        self.clar = GoimDiff()          # articles needing clearification
        self.clean = GoimDiff()         # articles needing clean-up
        self.con = GoimDiff()           # articles needing context
        self.disp = GoimDiff()          # accuracy disputes
        self.exp = GoimDiff()           # articles needing expert attention
        self.fac = GoimDiff()           # featured article candidates
        self.far = GoimDiff()           # featured article review
        self.farc = GoimDiff()          # featured article removal candidates
        self.gac = GoimDiff()           # good article candidates
        self.gar = GoimDiff()           # good article review
        self.imp = GoimDiff()           # needing importance to be explained
        self.macr = GoimDiff()          # maths a-class rating candidate
        self.merge = GoimDiff()         # articles to be merged
        self.misc = GoimDiff()          # articles in misc clean-up cats
        self.morefs = GoimDiff()        # articles needing additional refs
        self.norelref = GoimDiff()      # articles lacking reliable refs
        self.orig = GoimDiff()          # original research
        self.orph = GoimDiff()          # orphaned articles
        self.pr = GoimDiff()            # articles on peer review
        self.prop = GoimDiff()          # proposed deletion
        self.req = GoimDiff()           # requested articles
        self.rewr = GoimDiff()          # articles needing a rewrite
        self.rqIm = GoimDiff()          # requested images
        self.split = GoimDiff()         # articles to be split
        self.spr = GoimDiff()           # articles on scientific peer review
        self.tech = GoimDiff()          # too technical
        self.tone = GoimDiff()          # inappropriate tone
        self.unref = GoimDiff()         # articles lacking sources
        self.unsrc = GoimDiff()         # articles with unsourced statements
        self.vfy = GoimDiff()           # articles needing verification
        self.wfy = GoimDiff()           # articles needing wikification
        self.vfd = {}                   # listed on VfD


class GoimDiff:
    """Lists of articles plus last seven diffs."""
    def __init__(self):
        self.cur = None                 # current list
        self.curtime = None             # timestamp of 'cur'
        self.added = []                 # list of lists of added files
        self.rmvd = []                  # list of lists of removed files
        self.times = []                 # list of timestamps

    def update(self, new):
        try:
            old = self.cur
        except AttributeError:
            old = None
            
        new.sort()
        new = removeDuplicates(new)

        if old != None:
            added = [ l for l in new if not l in old ]
            rmvd = [ l for l in old if not l in new ]
            if added or rmvd:
                try:
                    (self.added, self.rmvd, self.times)
                except AttributeError:
                    self.added = []
                    self.rmvd = []
                    self.times = []
                self.added = [added] + self.added[:6]
                self.rmvd = [rmvd] + self.rmvd[:6]
                self.times = [self.curtime] + self.times[:6]
                self.cur = new
                self.curtime = datetime.datetime.utcnow()
            return (len(added), len(rmvd))
        else:
            self.cur = new
            self.curtime = datetime.datetime.utcnow()
            return (len(new),0)

    def makeList(self, rmvd = False, num = 7, percOf = None, total = True, prefix = None):
        """Make a list of articles added.
        rmvd: if True, list articles removed instead.
        num: number of items in the list.
        total: display total number, unless rmvd is True.
        percOf: if not None, write percentage of percOf.
        prefix: if not None, prefix to be added in front of article name.
        """
        
        res = ''
        if rmvd:
            l = self.rmvd
        else:
            l = self.added

        for i in range(min(len(l), num)):
            if l[i]:
                if i == 0:
                    t = self.curtime
                else:
                    t = self.times[i-1]
                t = (t-datetime.timedelta(0.75)).strftime('%d %b')
                t = t.lstrip('0')
                x = []
                for a in l[i]:
                    if prefix:
                        atxt = prefix + a
                    else:
                        atxt = a
                    if a in self.cur or rmvd:
                        x.append('[[' + atxt + ']]')
                    else:
                        x.append('<s>[[' + atxt + ']]</s>')
                x = ', '.join(x)
                res += "* '''%s''': %s.\n" % (t,x)

        if not rmvd and total:
            if percOf:
                res += "* '''Total''': %d (%.2f%%).\n" \
                       % (len(self.cur), float(len(self.cur))/percOf*100)
            else:    
                res += "* '''Total''': %d.\n" % (len(self.cur),)

        if not res:
            res = 'No news.\n'

        return res
    

def list2wiki0(list):
    l = [ '* [[%s]]\n' % a for a in list ]
    return ''.join(l)


def list2wiki(list):
    if not list:
        return '* None\n';
    list.sort()
    list = removeDuplicates(list)
    n = len(list)
    if n <= 9:
        res = list2wiki0(list)
    else:
        res = '{| style="background-color: transparent; width: 100%"\n' \
              + '| valign="top" |\n' \
              + list2wiki0(list[:(n+2)/3]) \
              + '| valign="top" |\n' \
              + list2wiki0(list[(n+2)/3:(2*n+2)/3]) \
              + '| valign="top" |\n' \
              + list2wiki0(list[(2*n+2)/3:]) \
              + '|}\n' 
    return res


# This function is not used
def studyCatTree(root, cats, catsRec, ignore, arts):
    """Traverses the category subtree starting at _root_ and looks for
    articles from the list _arts_. Returns a dictionary D such that:
    * for all c in cats, D[c] is a list of all articles from _arts_
      found in the category c;
    * for all c in catsRec, D[c] is a list of all articles from _arts_
      found in the category c or one of its subcategories;
    * D['misc'] is a list of tuples (article, category) containing all
      articles from _arts_ found elsewhere in the subtree.
    Categories in _ignore_ and their subcategories are ignored."""

    todo = [root]
    done = []
    D = {}
    D['misc'] = []
    while todo:
        #print todo
        catname = todo.pop(0)
        if catname in ignore or catname in done:
            continue
        c = catlib.Category(site, catname)
        if catname in catsRec:
            a = c.articles(True)
        else:
            subcats = c.subcategories()
            titles = [ x.titleWithoutNamespace() for x in subcats ]
            todo.extend([ x for x in titles
                          if not x in done and x != catname ])
            a = c.articles(False)
        a = [ x.titleWithoutNamespace() for x in a
              if ((x.namespace() == 0 or x.namespace() == 1)
                  and x.titleWithoutNamespace() in arts) ]
        if catname in cats or catname in catsRec:
            D[catname] = a
        else:
            D['misc'].extend([(x,catname) for x in a])
    D['misc'].sort()
    return D
            
    
def removeDuplicates(list):
    """Removes duplicates in a sorted list"""
    res = []
    for x in list:
        if not x in res:
            res.append(x)
    return res


def getLinks(txt):
    """Only returns links to pages in main namespace in this Wikipedia"""
    Rlink = re.compile(r'\[\[(?P<title>[^:\]\|]*)(?:\|[^\]\|]*)?\]\]')
    links = Rlink.findall(txt)
    Rlink = re.compile(r'\{\{arttalk\|(?P<title>[^:\}]*)\}\}')
    links += Rlink.findall(txt)
    links = [ link[0].upper() + link[1:] for link in links ]
    links.sort()
    return removeDuplicates(links)

    
def removeLines(txt, s1, s2):
    """Remove those lines from 'txt' which start with 's1' and end
    with 's2'."""
    lines = txt.split('\n')
    lines = [ l for l in lines if not (l.startswith(s1) and l.endswith(s2)) ]
    return '\n'.join(lines)


def getSection(txt, title):
    """Get a section (of arbitrary level) with the given title."""

    lines = txt.split('\n')
    start = None
    for i in range(len(lines)):
        line = lines[i]
        k = 0
        while len(line) > k and line[k] == '=':
            k = k+1
        if start and k == level:
            return '\n'.join(lines[start+1:i])
        if k > 1:
            x = line[k:].strip()
            if x.startswith(title) and x[len(title):].strip() == k * '=':
                start = i
                level = k
    print 'WARNING: getSection() could not find "%s"\n' % (title,)
    

def getPage(title):
    """Get a page from Wikipedia."""

    page = wikipedia.Page(site, title)
    txt = page.get()
    return txt


def getRX_TX():
    data = os.popen('/sbin/ifconfig eth0').read()
    rx = int(re.search(r'RX bytes:(\d+)\D', data).group(1))
    tx = int(re.search(r'TX bytes:(\d+)\D', data).group(1))
    return (rx,tx)

def initProgressReport():
    now = datetime.datetime.now()
    tempdata['starttime'] = now
    tempdata['lasttime'] = now
    (rx,tx) = getRX_TX()
    tempdata['startrx'] = rx
    tempdata['lastrx'] = rx
    tempdata['starttx'] = tx
    tempdata['lasttx'] = tx
    print "Info: initProgressReport() time is %s, RX = %d, TX = %d" \
          % (now.time().strftime('%H:%M:%S'), rx, tx)


def formatTD(td):
    s = td.seconds % 60
    m = (td.seconds / 60) % 60
    h = td.seconds / 3600
    if h == 0:
        return "%d:%02d" % (m,s)
    else:
        return "%dh%02d:%02d" % (h,m,s)

    
def progressReport():
    now = datetime.datetime.now()
    passed = now - tempdata['lasttime']
    passedcum = now - tempdata['starttime']
    tempdata['lasttime'] = now
    (rxnow,txnow) = getRX_TX()
    rx = rxnow - tempdata['lastrx']
    rxcum = rxnow - tempdata['startrx']
    tempdata['lastrx'] = rxnow
    tx = txnow - tempdata['lasttx']
    txcum = txnow - tempdata['starttx']
    tempdata['lasttx'] = txnow
    mil = 1000000.0
    print "Info: time %s (cum %s), RX %.2f MB, TX %.2f MB (cum %.2f/%.2f)" \
          % (formatTD(passed), formatTD(passedcum),
             rx/mil, tx/mil, rxcum/mil, txcum/mil)


def replaceStanza(page, title, txt):
    cmt = '<!--jngoim:%s-->' % (title,)
    try:
        i1 = page.index(cmt)
        i2 = page.index(cmt, i1+1)
        if txt.endswith('\n'):
            txt = '\n' + txt
            page = page[:i1+len(cmt)] + txt + page[i2:]
    except ValueError:
        print "Warning: %s not found" % cmt
    return page


def summarizeFeatured0(diff, txt):
    res = []
    for i in range(len(diff.added)):
        if i == 0:
            t = diff.curtime
        else:
            t = diff.times[i-1]
        for a in diff.added[i]:
            if a in diff.cur:
                res += [(t, txt(a))]
            else:
                res += [(t, '<s>' + txt(a) + '</s>')]
    return res

    
def summarizeFeatured():
    lst = [];
    txt = lambda x: ('[[%s]] is candidate to become a Featured Article '
                     + '([[Wikipedia:Featured article candidates/%s|discussion]])') % (x,x)
    lst = summarizeFeatured0(data.fac, txt)
    txt = lambda x: ('The Featured Article status of [[%s]] is under review '
                     + '([[Wikipedia:Featured article review/%s|discussion]])') % (x,x)
    lst.extend(summarizeFeatured0(data.far, txt))
    txt = lambda x: ('[[%s]] is nominated to have its Featured Article status removed '
                     + '([[Wikipedia:Featured article review/%s|discussion]])') % (x,x)
    lst.extend(summarizeFeatured0(data.farc, txt))
    txt = lambda x: ('A [[Wikipedia:WikiProject Mathematics/A-class rating/%s|discussion]] '
                     + 'has been started on whether [[%s]] should be graded as A-class quality') % (x,x)
    lst.extend(summarizeFeatured0(data.macr, txt))
    txt = lambda x: ('[[%s]] is undergoing Peer Review '
                     + '([[Wikipedia:Peer review/%s|discussion]])') % (x,x)
    lst.extend(summarizeFeatured0(data.pr, txt))
    txt = lambda x: ('[[%s]] is undergoing Scientific Peer Review '
                     + '([[Wikipedia:Scientific peer review/%s|discussion]])') % (x,x)
    lst.extend(summarizeFeatured0(data.spr, txt))
    txt = lambda x: ('[[%s]] is candidate to become a Good Article '
                     + '(see [[Wikipedia:Good article candidates]])') % x
    lst.extend(summarizeFeatured0(data.gac, txt))
    txt = lambda x: ('The Good Article status of [[%s]] is under review '
                     + '([[Wikipedia:Good articles/Review#%s|discussion]])') % (x,x)
    lst.extend(summarizeFeatured0(data.gar, txt))
    lst.sort(key = lambda x: x[0], reverse = True) # sort on dates
    res = '';
    for i in range(len(lst)):
        if i == 6:
            cutoff = lst[i][0]
        # Don't list struck through items more 18h older than the 7th item
        if (i <= 6 or cutoff-lst[i][0] < datetime.timedelta(0.75) 
            or not lst[i][1].startswith('<s>')):
            t = (lst[i][0]-datetime.timedelta(0.75)).strftime('%d %b').lstrip('0')
            res += ("* '''%s''': %s.\n" % (t,lst[i][1]))
    if not res:
        res = 'None.\n'
    return res

    
def summarizeVfd(vfd):
    res = ''
    dates = vfd.keys()
    dates.sort()
    dates.reverse()
    for d in dates:
        l1 = [ a for a in vfd[d] if a in data.all.cur ]
        l2 = [ a for a in vfd[d] if (a.endswith('nomination)') and a.rfind('(') != -1
                                     and a[:a.rfind('(')-1] in data.all.cur) ]
        if l1 or l2:
            x1 = [ ( '[[%s]] ([[Wikipedia:Articles for deletion/%s' +
                    '|discussion]])' )
                  % (a,a) for a in l1 ]
            x2 = [ ( '[[%s]] ([[Wikipedia:Articles for deletion/%s' +
                    '|discussion]])' )
                  % (a[:a.rfind('(')-1], a) for a in l2 ]
            x = ', '.join(x1 + x2)
            t = d.strftime('%d %b').lstrip('0')
            res += ("* '''%s''': %s.\n" % (t,x))
    if not res:
        res = 'None.\n'
    return res


def writeGoim():
    page = wikipedia.Page(site, 'Wikipedia:WikiProject Mathematics/Current activity')
    txt = page.get()

    txt = replaceStanza(txt, 'requested', data.req.makeList())
    txt = replaceStanza(txt, 'attention', data.att.makeList())
    txt = replaceStanza(txt, 'reqImages', data.rqIm.makeList())
    txt = replaceStanza(txt, 'onVfd', summarizeVfd(data.vfd))
    txt = replaceStanza(txt, 'onCfD',
                        data.cfd.makeList(prefix = ":Category:", total = False))
    txt = replaceStanza(txt, 'prop', data.prop.makeList(total = False))
    txt = replaceStanza(txt, 'newArticles', data.all.makeList())
    txt = replaceStanza(txt, 'rmvdArticles', data.all.makeList(True))
    txt = replaceStanza(txt, 'context', data.con.makeList())
    txt = replaceStanza(txt, 'cleanup', data.clean.makeList())
    txt = replaceStanza(txt, 'verify', data.vfy.makeList())
    txt = replaceStanza(txt, 'expert', data.exp.makeList())
    txt = replaceStanza(txt, 'technical', data.tech.makeList())
    txt = replaceStanza(txt, 'wikify', data.wfy.makeList())
    txt = replaceStanza(txt, 'featured', summarizeFeatured())

    if not nowrite:
        page.put(txt, subject)
        file('goim/goim.out', 'w').write(txt.encode('utf-8'))
        print "Info: writeGoim() finished at %s" \
              % (datetime.datetime.now().time(),)
    else:
        print txt.encode('utf-8')
    
    
def updatePortal():
    page = wikipedia.Page(site, 'Wikipedia:WikiProject Mathematics/Count')
    page.put(str(len(data.all.cur)), subject, watchArticle=True)
    print "Info: updatePortal() finished"
    progressReport()
    

def updateScript():
    scriptpage = wikipedia.Page(site, "User:Jitse's bot/goim.py")
    text = file('goim/goim.py').read()
    text = '<pre><nowiki>\n' + text + '</no' + 'wiki></p' + 're>' # Split to confuse MW parser
    if scriptpage.get() != text:
        scriptpage.put(text, subject)
    print "Info: updateScript() finished"
    progressReport()


def updateLists():
    pn = 'Wikipedia:Pages needing attention/Mathematics/Lists'
    page = wikipedia.Page(site, pn)
    txt = page.get()
    txt = replaceStanza(txt, 'verify',       list2wiki(data.vfy.cur))
    txt = replaceStanza(txt, 'expert',       list2wiki(data.exp.cur))
    # {{attention}} is merged with cleanup
    #txt = replaceStanza(txt, 'attention',    list2wiki(tempdata['att']))
    txt = replaceStanza(txt, 'wikify',       list2wiki(data.wfy.cur))
    txt = replaceStanza(txt, 'context',      list2wiki(data.con.cur))
    txt = replaceStanza(txt, 'reqImages',    list2wiki(data.rqIm.cur))
    txt = replaceStanza(txt, 'technical',    list2wiki(data.tech.cur))
    txt = replaceStanza(txt, 'cleanup',      list2wiki(data.clean.cur))
    txt = replaceStanza(txt, 'merge',        list2wiki(data.merge.cur))
    txt = replaceStanza(txt, 'disputed',     list2wiki(data.disp.cur))
    txt = replaceStanza(txt, 'unreferenced', list2wiki(data.unref.cur))
    txt = replaceStanza(txt, 'norelref',     list2wiki(data.norelref.cur))
    txt = replaceStanza(txt, 'morerefs',     list2wiki(data.morefs.cur))
    txt = replaceStanza(txt, 'unsourced',    list2wiki(data.unsrc.cur))
    txt = replaceStanza(txt, 'split',        list2wiki(data.split.cur))
    txt = replaceStanza(txt, 'clarification',list2wiki(data.clar.cur))
    txt = replaceStanza(txt, 'rewrite',      list2wiki(data.rewr.cur))
    txt = replaceStanza(txt, 'importance',   list2wiki(data.imp.cur))
    txt = replaceStanza(txt, 'tone',         list2wiki(data.tone.cur))
    txt = replaceStanza(txt, 'original',     list2wiki(data.orig.cur))
    txt = replaceStanza(txt, 'orphaned',     list2wiki(data.orph.cur))

    l = [ '* [[%s]] is in [[:Category:%s]]\n' % a for a in data.misc.cur ]
    txt = replaceStanza(txt, 'misc',         ''.join(l))

    if not nowrite:
        page.put(txt, subject + ' (phase %d)' % (datetime.datetime.now().toordinal() % 4))
        print "Info: updateLists() finished"
        progressReport()
    else:
        print '-' * 70
        print txt.encode('utf-8')

    
def listedOnVfd():
    if not 'vfd' in dir(data):
        data.vfd = {}
    newvfd = {}
    d = datetime.date.today()
    for i in range(10):
        d = d - datetime.timedelta(1)   # subtract one day
        if d in data.vfd:
            newvfd[d] = data.vfd[d]
        else:
            ds = d.strftime('%Y %B %d')
            ds = ds.replace(' 0', ' ')      # 'June 05' -> 'June 5'
            ttl = 'Wikipedia:Articles for deletion/Log/' + ds
            try:
               txt = getPage(ttl)
            except wikipedia.IsRedirectPage:
               ttl = 'Wikipedia:Votes for deletion/Log/' + ds
               txt = getPage(ttl)
            R1 = re.compile(r'\{\{Wikipedia:Votes for deletion/([^\}]*)\}\}')
            newvfd[d] = R1.findall(txt)
            R1 = re.compile(r'\{\{Wikipedia:Pages for deletion/([^\}]*)\}\}')
            newvfd[d] += R1.findall(txt)
            R1 = re.compile(r'\{\{Wikipedia:Articles for deletion/([^\}]*)\}\}')
            newvfd[d] += R1.findall(txt)
            print "Info: listedOnVfd() found %d for %s" \
                  % (len(newvfd[d]), ds)
            newvfd[d] = map(lambda x: x.replace('_', ' '), newvfd[d])
    data.vfd = newvfd
    print "Info: listedOnVfd() finished"
    progressReport()
   

def listedOnCfd():
    if not 'cfd' in dir(data) or isinstance(data.cfd, dict):
        data.cfd = GoimDiff()
        data.cfd.update([])
    cats = [];
    cfdcat = catlib.Category(site, "Category:Categories for discussion")
    for cfdsubcat in cfdcat.subcategories():
        cs = cfdsubcat.subcategories()
        cs = [ c.titleWithoutNamespace() for c in cs ]
        cs = [ c for c in cs if c in data.cat.cur ]
        cats.extend(cs)
    res = data.cfd.update(cats)
    print "Info: listedOnCfd() added %d, removed %d" % res
    progressReport()
    
        
def harvestFAC():
    if not 'fac' in dir(data):
        data.fac = GoimDiff()
        data.fac.update([])
    ttl = 'Wikipedia:Featured article candidates'
    txt = getPage(ttl)
    R1 = re.compile(r'\{\{' + ttl + r'/([^\}]*)\}\}')
    lst = R1.findall(txt)
    print "Info: Found %d articles on FAC" % len(lst)
    lst2 = [ x for x in lst if x in data.all.cur ]
    lst2.sort()
    res = data.fac.update(removeDuplicates(lst2))
    if not res == (0,0):
        print "Info: harvestFAC() added %d, removed %d" % res


def harvestFAR():
    if not 'far' in dir(data):
        data.far = GoimDiff()
        data.far.update([])
    if not 'farc' in dir(data):
        data.farc = GoimDiff()
        data.farc.update([])
    ttl = 'Wikipedia:Featured article review'
    R1 = re.compile(r'\{\{' + ttl + r'/([^\}]*)\}\}')
    txt = getPage(ttl)
    i = txt.index('==Featured article removal candidates==')
    lst = R1.findall(txt[:i])
    print "Info: Found %d articles on FAR" % len(lst)
    lst2 = [ x for x in lst if x in data.all.cur ]
    lst2.sort()
    res = data.far.update(removeDuplicates(lst2))
    if not res == (0,0):
        print "Info: harvestFAR() added %d, removed %d to FAR" % res
    lst = R1.findall(txt[i:])
    print "Info: Found %d articles on FARC" % len(lst)
    lst2 = [ x for x in lst if x in data.all.cur ]
    lst2.sort()
    res = data.farc.update(removeDuplicates(lst2))
    if not res == (0,0):
        print "Info: harvestFAR() added %d, removed %d to FARC" % res


def harvestGAC():
    if not 'gac' in dir(data):
        data.gac = GoimDiff()
        data.gac.update([])
    ttl = 'Wikipedia:Good article nominations'
    txt = getPage(ttl)
    R1 = re.compile(r'\{\{article\|([^\}]*)\}\}')
    lst = R1.findall(txt)
    R1 = re.compile(r'\{\{la\|([^\}]*)\}\}')
    lst.extend(R1.findall(txt))
    print "Info: Found %d articles on GAC" % len(lst)
    lst2 = [ x for x in lst if x in data.all.cur ]
    lst2.sort()
    res = data.gac.update(removeDuplicates(lst2))
    if not res == (0,0):
        print "Info: harvestGAC() added %d, removed %d" % res


def harvestGAR():
    if not 'gar' in dir(data):
        data.gar = GoimDiff()
        data.gar.update([])
    ttl = 'Wikipedia:Good article reassessment'
    txt = getPage(ttl)
    R1 = re.compile(r'===\s*\[\[([^\]]*)\]\]\s*===')
    lst = R1.findall(txt)
    print "Info: Found %d articles on GAR" % len(lst)
    lst2 = [ x for x in lst if x in data.all.cur ]
    lst2.sort()
    res = data.gar.update(removeDuplicates(lst2))
    if not res == (0,0):
        print "Info: harvestGAR() added %d, removed %d" % res


def harvestPR():
    if not 'pr' in dir(data):
        data.pr = GoimDiff()
        data.pr.update([])
    ttl = 'Wikipedia:Peer review'
    txt = getPage(ttl)
    R1 = re.compile(r'\{\{' + ttl + r'/([^\}]*)\}\}')
    lst = R1.findall(txt)
    print "Info: Found %d articles on PR" % len(lst)
    lst2 = [ x for x in lst if x in data.all.cur ]
    lst2.sort()
    res = data.pr.update(removeDuplicates(lst2))
    if not res == (0,0):
        print "Info: harvestPR() added %d, removed %d" % res


def harvestSPR():
    if not 'spr' in dir(data):
        data.spr = GoimDiff()
        data.spr.update([])
    ttl = 'Wikipedia:Scientific peer review/recent reviews'
    lst = getLinks(getPage(ttl))
    print "Info: Found %d articles on SPR" % len(lst)
    lst2 = [ x for x in lst if x in data.all.cur ]
    lst2.sort()
    res = data.spr.update(removeDuplicates(lst2))
    if not res == (0,0):
        print "Info: harvestSPR() added %d, removed %d" % res


def harvestMACR():
    if not 'macr' in dir(data):
        data.macr = GoimDiff()
    ttl = 'Wikipedia:WikiProject Mathematics/A-class rating'
    txt = getPage(ttl)
    R1 = re.compile(r'^\{\{' + ttl + r'/([^\}]*)\}\}', re.MULTILINE)
    lst = R1.findall(txt)
    print "Info: Found %d articles on MACR" % len(lst)
    lst.sort()
    res = data.macr.update(removeDuplicates(lst))
    if not res == (0,0):
        print "Info: harvestMACR() added %d, removed %d" % res


def featuredContent():
    goimTry(harvestFAC)
    goimTry(harvestFAR)
    goimTry(harvestGAC)
    goimTry(harvestGAR)
    goimTry(harvestPR)
    goimTry(harvestSPR)
    goimTry(harvestMACR)
    progressReport()
        
    
def mathArticles():
    if not 'all' in dir(data):
        data.all = GoimDiff()
    links = []
    links = getLinks(getPage('Lists of mathematics topics'))
    links += getLinks(getPage('List of mathematics articles (0-9)'))
    for i in range(ord('A'), ord('Z')+1):
        links += getLinks(getPage('List of mathematics articles ('+chr(i)+')'))

    lines = getPage('List of mathematicians').split('\n')
    lines = [ l for l in lines if l.lstrip().startswith('*') ]
    lines = [ l[:(l.find(']]')+2)] for l in lines if l.find(']]') != -1 ]
    links += getLinks('\n'.join(lines))

    try:
        for i in range(ord('A'), ord('Z')+1):
            lines = getPage('List of mathematicians ('+chr(i)+')')
            lines = lines.split('\n')
            lines = [ l for l in lines if l.lstrip().startswith('*') ]
            lines = [ l[:(l.find(']]')+2)] for l in lines if l.find(']]') != -1 ]
            links += getLinks('\n'.join(lines))
    except wikipedia.NoPage:
        print '[[List of mathematicians]] not yet split.'
    except:
        sys.excepthook(*sys.exc_info())
        print '\nGoIM: Ignoring above exception.\n'
    
    links.sort()
    res = data.all.update(removeDuplicates(links))
    print "Info: mathArticles() added %d, removed %d" % res
    progressReport()
    

def mathCategories():
    """Get all mathematical categories (including mathematicians)."""
    if not 'cat' in dir(data):
        data.cat = GoimDiff()
    txt = getPage('Wikipedia:WikiProject Mathematics/List of mathematics categories')
    txt = txt[0 : txt.index('==Mathematics-related categories==') ]
    lines = txt.split('\n')
    cats = []
    prefix = '[[:Category:'
    for l in lines:
        l = l.lstrip()
        if l.startswith(prefix):
            cats.append(l[len(prefix):l.index('|')])
    cats.sort()
    res = data.cat.update(removeDuplicates(cats))
    print "Info: mathCategories() added %d, removed %d" % res
    progressReport()

    
def articlesFromCategory(catname, recursive=False, subcatword=False):
    """Get all articles from a category, plus all articles
    corresponding to talk pages from the category. If recurse is True,
    then also go to all subcategories. If subcatword is set, then also
    go to all subcategories whose name starts with catname + ' ' +
    subcateword."""
    cat = catlib.Category(site, "Category:" + catname)
    arts = cat.articles(recursive)
    links = [ l.titleWithoutNamespace() for l in arts
              if l.namespace() == 0 or l.namespace() == 1 ]
    if subcatword:
        for c in cat.subcategories():
            if c.title().startswith("Category:" + catname + " " + subcatword):
                links += articlesFromCategory(c.title().replace("Category:", "", 1))
    return links


def reqImages():
    if not 'rqIm' in dir(data):
        data.rqIm = GoimDiff()
    links = articlesFromCategory('Wikipedia requested images', True)
    links = [ l for l in links if l in data.all.cur ]
    txt = getPage('Wikipedia:Requested pictures')
    txt = getSection(txt, 'Mathematics')
    txt = removeLines(txt, '==', '==')
    links += getLinks(txt)
    links.sort()
    res = data.rqIm.update(removeDuplicates(links))
    print "Info: reqImages() added %d, removed %d" % res
    progressReport()


def attention():
    if not 'att' in dir(data):
        data.att = GoimDiff()
    #Category no longer exists
    #links = articlesFromCategory('Pages needing attention')
    #links = [ l for l in links if l in data.all.cur ]
    links = []
    txt = getPage('Wikipedia:Pages needing attention/Mathematics')
    txt = removeLines(txt, '==', '==')
    txt = removeLines(txt, "''See", "")
    lines = [ l.lstrip().lstrip('*').lstrip()  for l in txt.split('\n') ]
    for l in lines:
        i = len(l)
        if l.find(':') != -1:
            i = min(i, l.find(':')) 
        if l.find('—') != -1:
            i = min(i, l.find('—')) 
        if l.find(' - ') != -1:
            i = min(i, l.find(' - '))
        links += getLinks(l[:i])
    res = data.att.update(links)
    print "Info: attention() added %d, removed %d" % res
    progressReport()


def requested():
    if not 'req' in dir(data):
        data.req = GoimDiff()
    txt = getPage('Wikipedia:Requested articles/Mathematics');
    txt = removeLines(txt, '==', '==')  # remove headings
    txt = removeLines(txt, "''See", "''") # remove comments
    txt = removeLines(txt, "(''", "'')") # remove more comments
    txt = removeLines(txt, ":''", "''") # remove more comments
    links = getLinks(txt)
    res = data.req.update(getLinks(txt))
    print "Info: requested() added %d, removed %d" % res
    progressReport()


def cleanupMisc0(id, catname, recurse=False, subcatword=False):
    """Put all maths articles from [[Category:catname]] to the id
    field of data. If recurse is True, then also go to all
    subcategories. If subcatword is set, then also go to all
    subcategories whose name starts with catname + ' ' +
    subcateword.""" 

    if not id in dir(data):
        setattr(data, id,  GoimDiff())
    links = articlesFromCategory(catname, recurse, subcatword)
    links = [ l for l in links if l in data.all.cur ]
    res = getattr(data, id).update(links)
    if not res == (0,0):
        print "Info: cleanupMisc0() added %d, removed %d, to/from %s" \
              % (res + (id,))

    
def cleanupMiscDay1():
    cleanupMisc0('con',   'Wikipedia articles needing context', True) 
    cleanupMisc0('exp',   'Pages needing expert attention', True) 
    cleanupMisc0('tech',  'Wikipedia articles that are too technical')
    cleanupMisc0('wfy',   'Articles that need to be wikified', True) 
    cleanupMisc0('merge', 'Articles to be merged', True)
    cleanupMisc0('disp',  'Accuracy disputes', subcatword='from')

    l = [ '1911 Britannica articles needing updates',
          # 'Aircraft without proper specifications',
          'Articles containing how-to sections',
          'Articles in need of internal merging',
          # unref = 'Articles lacking sources',
         ['Articles needing original script',True],
          'Articles needing sections',
          'Articles that are too long',
          'Articles that are way too long',
          # wfy = 'Articles that need to be wikified',
          # merge \subset 'Articles to be merged',
          # split = 'Articles to be split',
          'Articles to check for link ordering',
          'Articles to harmonize',
          'Articles using obsolete parameters',
          'Articles which may be unencyclopedic',
          # orig = 'Articles which may contain original research',
          'Articles with accessibility problems',
          'Articles with confusing statements',
          'Articles with incomplete statements',
          'Articles with peacock terms',
          'Articles with unsourced categories',
          'Articles with unsourced quotes',
          # unsrc = 'Articles with unsourced statements',
          'Articles with weasel words',
          'Articles without infobox',
          'Australia articles needing attention',
          'Biographical Directory of the United States Congress cleanup',
          'Biographies without real biographical information',
          'Biography articles needing attention',
          'Books needing cleanup',
          'CIA World Factbook cleanup',
          # 'Categories requiring diffusion',
          'Category needed',
          'Category needs checking',
          # 'Comics articles needing cleanup',
          # 'Comics needing cleanup',
          'Disambiguation pages in need of cleanup',
          # 'Firefly articles needing attention',
          # 'Guitarist articles needing attention',
          'History of Greece articles needing attention',
          # 'Images for cleanup',
          'India articles needing attention',
          # 'Invalid conservation status',
          # 'Law-related articles lacking sources',
          # merge \subset 'Merge by month',
          # 'Military history articles needing attention',
          'New Zealand cleanup',
          # 'Novel articles needing attention',
          # 'Novel articles with comments',
          # 'Nutrition & Dietetics articles requiring major expansion',
          # orph = 'Orphaned articles'
          # 'Orphaned categories',
          # 'Overpopulated stub categories',
          # att = 'Pages needing attention',
          'Pearle edits needing manual cleanup',
          'Philadelphia articles needing attention',
          # 'Places of local interest needing cleanup',
          # 'Portals needing attention',
          'Rough translations',
          # 'Schools needing cleanup',
          # 'Scouting articles needing attention',
          'Self-contradictory articles',
          # 'Spooks articles with comments',
          # 'Stub categories',
          # 'Tree of Life cleanup',
          # 'U.S. road articles needing work',
          'Uncategorised albums',
          'Uncategorised books',
          'Uncategorised films',
          # 'Very large categories',
          'Virginia articles needing attention',
          # 'WikiProject Comics cleanup',
          'Wikipedia articles containing buzzwords',
          'Wikipedia articles containing sections that are an unencyclopedically presented series of quotes',
          'Wikipedia articles in need of updating',
          # clar = 'Wikipedia articles needing clarification',
          # con = 'Wikipedia articles needing context',
          'Wikipedia articles needing copy edit',
          # vfy = 'Wikipedia articles needing factual verification',
          # rewr = 'Wikipedia articles needing rewrite',
          # tone = 'Wikipedia articles needing style editing',
          'Wikipedia articles needing their fiction made clear',
          'Wikipedia articles requiring OTRS cleanup',
          # tech = 'Wikipedia articles that are too technical',
          'Wikipedia articles using jargon',
          'Wikipedia articles with nonstandard pronunciation',
          'Wikipedia articles with off-topic sections',
          'Wikipedia articles with plot summary needing attention',
          # 'Wikipedia categories in need of attention',
          'Wikipedia cleanup after AFD',
          'Wikipedia external links cleanup',
          'Wikipedia infobox cleanup',
          # con = 'Wikipedia introduction cleanup',
          'Wikipedia laundry list cleanup',
          'Wikipedia list cleanup',
          # 'Wikipedia maintenance categories sorted by month',
          'Wikipedia references cleanup',
          # rqIm = 'Wikipedia requested images',
          'Wikipedia spam cleanup',
          'Wikipedia title cleanup',
          # ---
          'Articles which may be biased',
          'Articles with limited geographic scope',
          'Articles with obsolete information',
          'NPOV disputes',
          'Too Few Viewpoints'
        ]

    if not 'misc' in dir(data):
        data.misc = GoimDiff()
    pairs = []
    for x in l:
        if isinstance(x,list):
            catname = x[0]
            recursive = x[1]
        else:
            catname = x
            recursive = False
        try:
            links = articlesFromCategory(catname, recursive)
            links = [ (l,catname) for l in links if l in data.all.cur ]
            pairs.extend(links)
        except:
            sys.excepthook(*sys.exc_info())
            print ('\nGoIM: Ignoring above exception in cleanupMiscDay1()\n'
                   + '      Category = %s\n') % catname
    res = data.misc.update(pairs)
    if not res == (0,0):
        print "Info: cleanupMiscDay1() added %d, removed %d, to/from misc" % res

    print "Info: cleanupMiscDay1() finished"
    progressReport()


def unsourced():
    cleanupMisc0('unsrc', 'All articles with unsourced statements')
    progressReport()

    
def cleanupMiscDay2():
    cleanupMisc0('split', 'Articles to be split')
    cleanupMisc0('clar',  'Wikipedia articles needing clarification')
    cleanupMisc0('rewr',  'Wikipedia articles needing rewrite')
    cleanupMisc0('imp',   'Articles with topics of unclear notability', subcatword='from')
    cleanupMisc0('tone',  'Wikipedia articles needing style editing', subcatword='from')
    cleanupMisc0('orig',  'Articles that may contain original research', subcatword='since')
    cleanupMisc0('orph',  'Orphaned articles', subcatword='from')

    print "Info: cleanupMiscDay2() finished"
    progressReport()


def unref():
    if not 'unref' in dir(data):
        data.unref = GoimDiff()
    links = []
    catname = "Articles lacking sources"
    cat = catlib.Category(site, "Category:" + catname)
    for c in cat.subcategories():
        if c.title().startswith("Category:" + catname + " from"):
            links += articlesFromCategory(c.title().replace("Category:", "", 1))
    links += articlesFromCategory(catname)
    links = [ l for l in links if l in data.all.cur ]
    res = data.unref.update(links)
    print "Info: unref() added %d, removed %d" % res
    progressReport()

    
def norelref():
    if not 'norelref' in dir(data):
        data.norelref = GoimDiff()
    links = []
    catname = "Articles lacking reliable references"
    cat = catlib.Category(site, "Category:" + catname)
    for c in cat.subcategories():
        if c.title().startswith("Category:" + catname + " from"):
            links += articlesFromCategory(c.title().replace("Category:", "", 1))
    links += articlesFromCategory(catname)
    links = [ l for l in links if l in data.all.cur ]
    res = data.norelref.update(links)
    print "Info: norelref() added %d, removed %d" % res
    progressReport()

    
def morerefs():
    if not 'morefs' in dir(data):
        data.morefs = GoimDiff()
    links = []
    catname = "Articles needing additional references"
    cat = catlib.Category(site, "Category:" + catname)
    for c in cat.subcategories():
        if c.title().startswith("Category:" + catname + " from"):
            links += articlesFromCategory(c.title().replace("Category:", "", 1))
    links += articlesFromCategory(catname)
    links = [ l for l in links if l in data.all.cur ]
    res = data.morefs.update(links)
    print "Info: morefs() added %d, removed %d" % res
    progressReport()

    
def vfy():
    if not 'vfy' in dir(data):
        data.vfy = GoimDiff()
    links = []
    catname = "Wikipedia articles needing factual verification"
    cat = catlib.Category(site, "Category:" + catname)
    for c in cat.subcategories():
        if c.title().startswith("Category:" + catname + " since"):
            links += articlesFromCategory(c.title().replace("Category:", "", 1))
    links += articlesFromCategory(catname)
    links = [ l for l in links if l in data.all.cur ]
    res = data.vfy.update(links)
    print "Info: vfy() added %d, removed %d" % res
    progressReport()

    
def prop():
    cleanupMisc0('prop',  'All articles proposed for deletion')
    progressReport()


def cleanup():
    if not 'clean' in dir(data):
        data.clean = GoimDiff()
    links = []
    cat = catlib.Category(site, "Category:Cleanup by month")
    for c in cat.subcategories():
        if c.title().startswith("Category:Cleanup from"):
            links += articlesFromCategory(c.title().replace("Category:", "", 1))
    links += articlesFromCategory('All pages needing cleanup')
    links = [ l for l in links if l in data.all.cur ]
    res = data.clean.update(links)
    print "Info: cleanup() added %d, removed %d" % res
    progressReport()


def readData():
    """Read data from the data file, ignoring errors"""
    global data
    try:
        data = pickle.load(file('goim/data'))
    except:
        sys.excepthook(*sys.exc_info())
        print '\nGoIM: Ignoring above exception, starting with no data.\n'
        data = GoimData()
    

def transitionData():
    """Do whatever is necessary to update the data to the new format.
    Specifically:
    * merge data.stubs into data.all
    * remove data.cd
    """
    if 'stubs' in dir(data):
        l = data.all.cur + data.stubs.cur
        l.sort()
        l = removeDuplicates(l)
        print("Info: Merging data.stubs (%d) into data.all (%d), together (%d)"
              % (len(data.stubs.cur), len(data.all.cur), len(l)))
        data.all.cur = l
        del data.stubs
    if 'cd' in dir(data):
        print("Info: Removing data.cd")
        del data.cd
    if not 'orph' in dir(data):
        data.orph = GoimDiff()
        

def cleanupData():
    """Removed cruft from data. No-op"""
    pass


def writeData():
    """Write data to data file, cycling backups"""
    fn = 'goim/data';
    bn = lambda n : fn + '~%d~' % (n,)
    for n in range(7,1,-1):
        if os.access(bn(n-1), os.R_OK):
            os.rename(bn(n-1), bn(n))
    if os.access(fn, os.R_OK):
        os.rename(fn, bn(1))
    pickle.dump(data, file(fn, 'w'))


def dumpData(filename='goim/goim.dat'):
    """Dump data to file"""
    foo = lambda x: {'cur':x.cur, 'curtime':x.curtime,
                     'added':x.added, 'rmvd':x.rmvd, 'times':x.times}
    lst = ['req',   foo(data.req),    'att',   foo(data.att),
           'rqIm',  foo(data.rqIm),
           'all',   foo(data.all),    'cat',   foo(data.cat),
           'con',   foo(data.con), 
           'clean', foo(data.clean),  'exp',   foo(data.exp), 
           'tech',  foo(data.tech),   'vfy',   foo(data.vfy),
           'merge', foo(data.merge),  'rewr',  foo(data.rewr),
           'disp',  foo(data.disp),   'unsrc', foo(data.unsrc),
           'unref', foo(data.unref),  'norelref', foo(data.norelref),
           'morefs',foo(data.morefs), 'imp',   foo(data.imp),
           'split', foo(data.split),  'tone',  foo(data.tone),
           'clar',  foo(data.clar),   'orig',  foo(data.orig),
           'orph',  foo(data.orph),
           'wfy',   foo(data.wfy),    'prop',  foo(data.prop),
           'misc',  foo(data.misc),   'fac',   foo(data.fac),
           'far',   foo(data.far),    'farc',  foo(data.farc),
           'gac',   foo(data.gac),    'gar',   foo(data.gar),
           'pr',    foo(data.pr),     'spr',   foo(data.spr),
           'macr',  foo(data.macr), 
           'vfd',   data.vfd,         'cfd',   data.cfd ]
    txt = PrettyPrinter().pformat(lst)
    file(filename, 'w').write(txt.encode('utf-8'))


def writeRpim():
    cwd = os.getcwd()
    os.chdir('/home/jitse/public_html/wikipedia')
    file('rpim_number', 'w').write(str(len(data.all.cur)) + '\n')
    for i in range((len(data.all.cur)+99) / 100):
        txt = ''
        for j in range(i*100, min((i+1)*100, len(data.all.cur))):
            txt += wikipedia.Page(site,data.all.cur[j]).urlname() + '\n'
        f = file('rpim%03d' % (i,), 'w')
        f.write(txt)
        f.close()
    os.chdir(cwd)


def helpOleg():
    """Purge and do empty edits to LoMT (A-C) pages, at Oleg's request."""
    for s in ['A-C', 'D-F', 'G-I', 'J-L', 'M-O', 'P-R', 'S-U', 'V-Z']:
        p = wikipedia.Page(site, 'List of mathematics articles (' + s + ')')
        site.getUrl(site.purge_address(p.urlname())) # purge
        p.put(p.get(), '')              # empty edit
    print "Info: helpOleg() finished"
    progressReport()
    

def goimTry(fn):
    """Call fn() and print and ignore any exceptions raised during the call."""
    try:
        fn()
    except:
        sys.excepthook(*sys.exc_info())
        print '\nGoIM: Ignoring above exception.\n'
        

#  Main program

if __name__ == "__main__":

    k = 1
    while len(sys.argv) > k:
        if sys.argv[k] == 'noload':
            print 'Info: Noload mode enabled'
            noload = True
            k = k + 1
        elif sys.argv[k] == 'nowrite':
            print 'Info: Nowrite mode enabled'
            nowrite = True
            k = k + 1
        elif sys.argv[k] == 'dryrun':
            print 'Info: Dryrun mode enabled'
            noload = True
            nowrite = True
            k = k + 1
        elif sys.argv[k] == 'summary':
            subject = sys.argv[k+1]
            k = k + 2
        elif sys.argv[k] == 'log':
            if os.access('logs/goim', os.R_OK):
                os.rename('logs/goim', 'logs/goim.old')
	    wikipedia.setLogfileStatus(True, 'goim')
            import null_interface
            wikipedia.ui = null_interface.UI()
            k = k + 1
        elif sys.argv[k] == 'verbose':
            wikipedia.verbose += 1
            k = k + 1
        else:
            print 'FATAL ERROR: Error parsing arguments ' \
                  + ' '.join(sys.argv[1:])
            sys.exit(1)

    # # For testing purposes:
    # initProgressReport()
    # readData() 
    # goimTry(norelref)
    # goimTry(morerefs)
    # writeData()
    # nowrite = True
    # goimTry(updateLists)
    # wikipedia.stopme()
    # sys.exit(0)

    initProgressReport()
    readData()
    transitionData()
    if not noload:
        goimTry(mathArticles)
        goimTry(mathCategories)
        goimTry(requested)
        goimTry(attention)
        goimTry(listedOnVfd)
        goimTry(listedOnCfd)
        goimTry(prop)
        goimTry(featuredContent)
        day = datetime.datetime.now().toordinal()
	if day % 4 == 0:
            goimTry(reqImages)
        if day % 4 == 1:
            goimTry(vfy)
            goimTry(cleanupMiscDay1)
        if day % 4 == 2:
            goimTry(unsourced)
            goimTry(cleanupMiscDay2)
        if day % 4 == 3:
            goimTry(unref)
            goimTry(norelref)
            goimTry(morerefs)
            goimTry(cleanup)
                    
    goimTry(cleanupData)
    if not nowrite:
        writeData()

    goimTry(writeGoim)
    goimTry(updateLists)
        
    if not nowrite:
        goimTry(dumpData)
        goimTry(updatePortal)
        day = datetime.datetime.now().toordinal()
        if day % 4 == 3:
            goimTry(helpOleg)
        goimTry(updateScript)
        goimTry(writeRpim)
    else:
        dumpData('goim/goim.dat.test')

    wikipedia.stopme()