#!/usr/bin/env python

import sys, os, re, glob
sys.path.append(os.path.expanduser("~markd/compbio/code/pycbio/lib"))
from pycbio.tsv.TSVReader import TSVReader
from pycbio.sys import fileOps,procOps

# directory scanned for the per-database "*.tbls" table statistics files
tblStatsDir="/cluster/data/genbank/var/tblstats/hgwbeta/2008.01.30"
# file whose lines are joined into the regular expression that selects, by
# table name, the tables counted by this script
tblReFile="/cluster/data/genbank/etc/genbank.tbls"
# file whose lines are joined into the regular expression that flags a table
# as a uniqStr table
uniqStrReFile="/cluster/data/genbank/etc/uniqstr.tbls"

# regular expressions flagging a table as metadata; doesn't include uniqStr
metaDataTbls = ("^gbCdnaInfo$", "^imageClone$", "^refSeqStatus$", "^refLink$",
                "^refSeqSummary$", "^gbSeq$", "^gbExtFile$", "^gbStatus$", "^gbLoaded$",
                "^gbMiscDiff$")


def listToRe(lst):
    """Compile a sequence of regular expression strings into one pattern.

    Every entry of lst is wrapped in its own parenthesized group and the
    groups are joined with "|", so the compiled pattern matches whenever
    any one of the entries matches.
    """
    alternatives = ")|(".join(lst)
    return re.compile("(%s)" % alternatives)

def loadReFile(fname):
    """Compile the lines of a file into a single regular expression.

    The lines returned by fileOps.readFileLines(fname) are each treated as
    one alternative; the result matches when any of them matches.
    """
    patterns = fileOps.readFileLines(fname)
    return listToRe(patterns)

def toGb(bytes):
    """Convert a byte count to gigabytes (1024*1024*1024 bytes) as a float."""
    bytesPerGb = 1024.0 * 1024.0 * 1024.0
    return float(bytes) / bytesPerGb

def toGbFmt(bytes):
    """Format a byte count as gigabytes with two decimals and a "G" suffix."""
    gigabytes = toGb(bytes)
    return "%0.2fG" % gigabytes

def toGbPer(bytes, total):
    """Format bytes as a percentage of total, with two decimals and a "%" sign.

    Raises ZeroDivisionError when total is zero.
    """
    fraction = toGb(bytes) / toGb(total)
    return "%0.2f%%" % (100.0 * fraction)

def hasGbLoaded(tblFile):
    """Return True if tblFile has a line starting with "gbLoaded" and a tab.

    The check is done by running "egrep -q" on the file.  An exit status
    of 1 (no line matched) yields False.  Any other failure of egrep, for
    instance an unreadable file, is re-raised as procOps.ProcException
    rather than being reported as a match.
    """
    try:
        procOps.runProc(["egrep", "-q", "^gbLoaded\t", tblFile])
    except procOps.ProcException as e:
        if e.returncode == 1:
            return False
        # a real egrep error, not just "no match": don't pretend it matched
        raise
    return True

class GenbankTbl(object):
    """Size information and classification flags for a single table.

    Attributes:
      name       - table name
      dataLen    - data length of the table, as supplied by the caller
      indexLen   - index length of the table, as supplied by the caller
      isPerChrom - True if the name starts with "chr"
      isUniqStr  - True if the name matches the expressions in uniqStrReFile
      isMetaData - True if the name matches one of metaDataTbls
    """
    __slots__ = ("name", "dataLen", "indexLen", "isPerChrom", "isUniqStr", "isMetaData")
    # compiled once when the class is defined and shared by all instances
    uniqStrRe = loadReFile(uniqStrReFile)
    metaDataRe = listToRe(metaDataTbls)

    def __init__(self, name, dataLen, indexLen):
        """Record the table sizes and classify the table by its name."""
        self.name = name
        self.dataLen = dataLen
        self.indexLen = indexLen
        self.isPerChrom = name.startswith("chr")
        self.isUniqStr = self.uniqStrRe.match(name) is not None
        self.isMetaData = self.metaDataRe.match(name) is not None

    def __str__(self):
        """Return name, data length and index length as a tab-separated row."""
        return "\t".join([self.name, str(self.dataLen), str(self.indexLen)])

class GenbankTbls(list):
    """List of GenbankTbl objects read from one table statistics file.

    The database name (attribute db) is the base name of the file with
    its extension removed.  Only rows whose Name column matches gbTblRe
    are kept.
    """
    # compiled once when the class is defined and shared by all instances
    gbTblRe = loadReFile(tblReFile)

    def __init__(self, tblFile):
        """Load the matching rows of tblFile, a TSV file with at least the
        columns Name, Data_length and Index_length."""
        baseName = os.path.basename(tblFile)
        self.db = os.path.splitext(baseName)[0]
        columnTypes = {"Data_length": int, "Index_length": int}
        for row in TSVReader(tblFile, typeMap=columnTypes):
            if not self.gbTblRe.match(row.Name):
                continue
            self.append(GenbankTbl(row.Name, row.Data_length, row.Index_length))

    def totalSize(self):
        """Sum of data and index lengths over all tables."""
        return sum(tbl.dataLen + tbl.indexLen for tbl in self)

    def totalPerChromSize(self):
        """Sum of data and index lengths over the tables flagged isPerChrom."""
        return sum(tbl.dataLen + tbl.indexLen for tbl in self if tbl.isPerChrom)

    def totalUniqStrSize(self):
        """Sum of data and index lengths over the tables flagged isUniqStr."""
        return sum(tbl.dataLen + tbl.indexLen for tbl in self if tbl.isUniqStr)

    def totalMetaDataSize(self):
        """Sum of data and index lengths over the tables flagged isMetaData."""
        return sum(tbl.dataLen + tbl.indexLen for tbl in self if tbl.isMetaData)

    def dump(self, fh):
        """Write one row per table to fh: database name, then the table's
        string form."""
        for tbl in self:
            fileOps.prRowv(fh, self.db, str(tbl))
            
class GenbankOverhead(object):
    """Table size statistics for every database that has a gbLoaded table.

    Attributes:
      dbTblMap - dict mapping database name to its GenbankTbls
      dbTbls   - list of the same GenbankTbls objects
    """
    def __init__(self):
        """Load all "*.tbls" files in tblStatsDir that pass hasGbLoaded()."""
        self.dbTblMap = {}
        self.dbTbls = []

        # sorted so that the load order, and hence dump() output, is
        # deterministic rather than dependent on directory order
        for tblFile in sorted(glob.glob(tblStatsDir + "/*.tbls")):
            if hasGbLoaded(tblFile):
                tblInfo = GenbankTbls(tblFile)
                self.dbTblMap[tblInfo.db] = tblInfo
                self.dbTbls.append(tblInfo)

    def totalSize(self):
        """Sum of totalSize() over all databases."""
        return sum(dbTbls.totalSize() for dbTbls in self.dbTbls)

    def totalPerChromSize(self):
        """Sum of totalPerChromSize() over all databases."""
        return sum(dbTbls.totalPerChromSize() for dbTbls in self.dbTbls)

    def totalUniqStrSize(self):
        """Sum of totalUniqStrSize() over all databases."""
        return sum(dbTbls.totalUniqStrSize() for dbTbls in self.dbTbls)

    def totalMetaDataSize(self):
        """Sum of totalMetaDataSize() over all databases."""
        return sum(dbTbls.totalMetaDataSize() for dbTbls in self.dbTbls)

    def dump(self, fh):
        """Write the per-table rows of every database to fh."""
        for tbls in self.dbTbls:
            tbls.dump(fh)

    def report(self, fh, sharedDb="hg18"):
        """Write a summary of the sizes to fh.

        fh       - file-like object the rows are written to
        sharedDb - database whose uniqStr and metadata table sizes are
                   reported as the "shared" sizes; defaults to "hg18"

        Raises KeyError if no statistics were loaded for sharedDb.
        """
        size = self.totalSize()
        perChromSize = self.totalPerChromSize()
        uniqStrSize = self.totalUniqStrSize()
        metaSize = self.totalMetaDataSize()
        if sharedDb not in self.dbTblMap:
            raise KeyError("no table statistics loaded for database: " + str(sharedDb))
        sharedTbls = self.dbTblMap[sharedDb]
        sharedUniqStr = sharedTbls.totalUniqStrSize()
        sharedMetaData = sharedTbls.totalMetaDataSize()

        fileOps.prRowv(fh, "assemblies", len(self.dbTbls))
        fileOps.prRowv(fh, "totalSize", toGbFmt(size))
        fileOps.prRowv(fh, "perChrom", toGbFmt(perChromSize), toGbPer(perChromSize, size))
        fileOps.prRowv(fh, "uniqStrSize", toGbFmt(uniqStrSize), toGbPer(uniqStrSize, size))
        fileOps.prRowv(fh, "otherMetaSize", toGbFmt(metaSize), toGbPer(metaSize, size))
        fileOps.prRowv(fh)
        fileOps.prRowv(fh, "sharedUniqStr", toGbFmt(sharedUniqStr))
        fileOps.prRowv(fh, "sharedMetaData", toGbFmt(sharedMetaData))
        fileOps.prRowv(fh)

        # sizes remaining after removing the per-database copies
        # NOTE(review): the single shared copy (sharedUniqStr/sharedMetaData)
        # is not added back into these figures - confirm this is intended
        withSharedUniq = size - uniqStrSize
        withSharedUniqNoPerChrom = size - (uniqStrSize+perChromSize)
        withSharedAllMeta = size - (metaSize+uniqStrSize)

        fileOps.prRowv(fh, "withSharedUniq", toGbFmt(withSharedUniq), toGbPer(withSharedUniq, size))
        fileOps.prRowv(fh, "withSharedUniqNoPerChrom", toGbFmt(withSharedUniqNoPerChrom), toGbPer(withSharedUniqNoPerChrom, size))
        fileOps.prRowv(fh, "withSharedAllMeta", toGbFmt(withSharedAllMeta), toGbPer(withSharedAllMeta, size))

if __name__ == "__main__":
    # only produce the report when run as a script, not when imported
    gbo = GenbankOverhead()
    gbo.report(sys.stdout)
        

