# Source code for samples

"""
Defines the samples and the list of files together with the weights to use for them.

Examples
--------

>>> from mkShapesRDF.lib.search_files import SearchFiles
>>> searchFiles = SearchFiles()
>>> redirector = ""


>>> mcProduction = "Summer16_102X_nAODv7_Full2016v7"
>>> dataReco = "Run2016_102X_nAODv7_Full2016v7"
>>> mcSteps = "MCl1loose2016v7__MCCorr2016v7__l2loose__l2tightOR2016v7"
>>> fakeSteps = "DATAl1loose2016v7__l2loose__fakeW"
>>> dataSteps = "DATAl1loose2016v7__l2loose__l2tightOR2016v7"
>>> ##############################################
>>> ###### Tree base directory for the site ######
>>> ##############################################
>>> treeBaseDir = "/eos/cms/store/group/phys_higgs/cmshww/amassiro/HWWNano"
>>> limitFiles = -1
>>> def makeMCDirectory(var=""):
>>>     _treeBaseDir = treeBaseDir + ""
>>>     if redirector != "":
>>>         _treeBaseDir = redirector + treeBaseDir
>>>     if var == "":
>>>         return "/".join([_treeBaseDir, mcProduction, mcSteps])
>>>     else:
>>>         return "/".join([_treeBaseDir, mcProduction, mcSteps + "__" + var])
>>>
>>>
>>> mcDirectory = makeMCDirectory()
>>> fakeDirectory = os.path.join(treeBaseDir, dataReco, fakeSteps)
>>> dataDirectory = os.path.join(treeBaseDir, dataReco, dataSteps)



>>> samples = {}
>>> DataRun = [
>>>     ["B", "Run2016B-02Apr2020_ver2-v1"],
>>>     ["C", "Run2016C-02Apr2020-v1"],
>>>     ["D", "Run2016D-02Apr2020-v1"],
>>>     ["E", "Run2016E-02Apr2020-v1"],
>>>     ["F", "Run2016F-02Apr2020-v1"],
>>>     ["G", "Run2016G-02Apr2020-v1"],
>>>     ["H", "Run2016H-02Apr2020-v1"],
>>> ]
>>> DataSets = ["MuonEG", "SingleMuon", "SingleElectron", "DoubleMuon", "DoubleEG"]
>>> DataTrig = {
>>>     "MuonEG": " Trigger_ElMu",
>>>     "SingleMuon": "!Trigger_ElMu && Trigger_sngMu",
>>>     "SingleElectron": "!Trigger_ElMu && !Trigger_sngMu && Trigger_sngEl",
>>>     "DoubleMuon": "!Trigger_ElMu && !Trigger_sngMu && !Trigger_sngEl && Trigger_dblMu",
>>>     "DoubleEG": "!Trigger_ElMu && !Trigger_sngMu && !Trigger_sngEl && !Trigger_dblMu && Trigger_dblEl",
>>> }


>>> mcCommonWeightNoMatch = "XSWeight*SFweight*METFilter_MC"
>>> mcCommonWeight = "XSWeight*SFweight*PromptGenLepMatch2l*METFilter_MC"
>>> 
>>> 
>>> ###### Zjj EWK #######
>>> 
>>> files = nanoGetSampleFiles(mcDirectory, "EWK_LLJJ_MLL-50_MJJ-120")
>>> 
>>> samples["Zjj"] = {
>>>     "name": files,
>>>     "weight": mcCommonWeight,
>>>     "FilesPerJob": 1,
>>> }
>>> 
>>> 
>>> ###### DY MC ######
>>> dys = {
>>>     "DY_hardJets": "hardJets",
>>>     "DY_PUJets": "PUJets",
>>> }
>>> 
>>> files = nanoGetSampleFiles(mcDirectory, "DYJetsToLL_M-50_ext2")
>>> 
>>> samples["DY"] = {
>>>     "name": files,
>>>     "weight": mcCommonWeight
>>>     + "*( !(Sum(PhotonGen_isPrompt==1 && PhotonGen_pt>15 && abs(PhotonGen_eta)<2.6) > 0)) * ewknloW",
>>>     "FilesPerJob": 5,
>>>     "subsamples": dys,
>>> }
>>> 
>>> 
>>> ###########################################
>>> ################## DATA ###################
>>> ###########################################
>>> 
>>> samples["DATA"] = {
>>>     "name": [],
>>>     "weight": "METFilter_DATA*LepWPCut",
>>>     "weights": [],
>>>     "isData": ["all"],
>>>     "FilesPerJob": 50,
>>> }
>>> 
>>> for _, sd in DataRun:
>>>     for pd in DataSets:
>>>         files = nanoGetSampleFiles(dataDirectory, pd + "_" + sd)
>>> 
>>>         samples["DATA"]["name"].extend(files)
>>>         addSampleWeight(samples, "DATA", pd + "_" + sd, DataTrig[pd])
>>>         # samples['DATA']['weights'].extend([DataTrig[pd]] * len(files))
"""
# flake8: noqa E266
from mkShapesRDF.lib.search_files import SearchFiles
import os



def nanoGetSampleFiles(path, name):
    """
    Retrieve the files of a sample given the folder path and the sample name.

    Parameters
    ----------
    path : str
        path to folder where to look for files
    name : str
        name of the sample to look for

    Returns
    -------
    `list of tuple`
        one-element list in the form ``[(name, list of files)]``

    Notes
    -----
    This function uses SearchFiles (the object ``searchFiles``) to retrieve the files and the Latino naming convention is assumed.
    The names ``searchFiles``, ``redirector`` and ``limitFiles`` are read as module-level globals
    at call time (see the example in the module docstring for how they are expected to be set up).

    When ``limitFiles`` is not ``-1`` and more than ``limitFiles`` files are found, only the first
    ``limitFiles`` files are kept.

    """
    found = searchFiles.searchFiles(path, name, redirector=redirector)
    # -1 is the "no limit" sentinel; it must be tested explicitly because
    # slicing with [:-1] would silently drop the last file.
    if limitFiles != -1 and len(found) > limitFiles:
        found = found[:limitFiles]
    return [(name, found)]


def CombineBaseW(samples, proc, samplelist):
    """
    Combine baseW for a given process.

    If two samples (different names) enter the same phase space the new baseW will consider
    the XS and the sum of all ``genEventSumw`` across all the files.

    Parameters
    ----------
    samples : dict
        dictionary of samples
    proc : str
        the samples key for the process
    samplelist : `list of str`
        list of sample name inside ``samples[proc]`` to combine

    Raises
    ------
    ValueError
        if none of the names in ``samplelist`` is present in ``samples[proc]["name"]``

    Notes
    -----
    Will call ``addSampleWeight`` for each sample in ``samplelist``.

    The weight added to each sample is the string ``"<newBaseW>/baseW"``, where ``newBaseW`` is
    ``baseW * sum(genEventSumw)`` of the sample with the fewest files, divided by the sum of
    ``genEventSumw`` over the files of all the selected samples.

    ``ROOT`` is expected to be available as a global name when this function is called.

    """
    # File lists of the entries of this process whose name is in samplelist
    fileLists = [
        entry[1] for entry in samples[proc]["name"] if entry[0] in samplelist
    ]
    if not fileLists:
        raise ValueError(
            "CombineBaseW: none of the samples {!r} was found in samples[{!r}]['name']".format(
                samplelist, proc
            )
        )

    # Use the sample with the fewest files as reference (first one in case of ties)
    leastFiles = min(fileLists, key=len)
    refSumw = ROOT.RDataFrame("Runs", leastFiles).Sum("genEventSumw").GetValue()

    refFile = ROOT.TFile(leastFiles[0])
    try:
        events = refFile.Get("Events")
        # NOTE(review): entry index 1 is read here; presumably baseW is the same
        # for every entry of the sample -- confirm.
        events.GetEntry(1)
        xs = events.baseW * refSumw
    finally:
        # The file is only needed to read baseW, do not leave it open
        refFile.Close()

    # Sum of genEventSumw over the files of all the selected samples
    allFiles = [fileName for files in fileLists for fileName in files]
    totalSumw = ROOT.RDataFrame("Runs", allFiles).Sum("genEventSumw").GetValue()

    weight = str(xs / totalSumw) + "/baseW"

    for iSample in samplelist:
        addSampleWeight(samples, proc, iSample, weight)


def addSampleWeight(samples, sampleName, sampleNameType, weight):
    """
    Add weight to a sample

    The entry of ``samples[sampleName]["name"]`` whose first element is ``sampleNameType`` is
    removed and re-appended at the end of the list with the weight attached as third element:
    ``"(weight)"`` if the entry had no weight yet, ``"<old weight>*(weight)"`` otherwise.

    Parameters
    ----------
    samples : dict
        dictionary of samples
    sampleName : str
        the samples key for the process
    sampleNameType : str
        the sample name inside ``samples[sampleName]`` to add the weight to
    weight : str
        the weight to add

    Raises
    ------
    IndexError
        if no entry named ``sampleNameType`` is present in ``samples[sampleName]["name"]``;
        the list is left untouched in that case

    """
    entries = samples[sampleName]["name"]
    matching = [entry for entry in entries if entry[0] == sampleNameType]
    if not matching:
        raise IndexError(
            "addSampleWeight: sample {!r} not found in samples[{!r}]['name']".format(
                sampleNameType, sampleName
            )
        )

    # Only the first matching entry is kept; every entry with that name is
    # dropped from the list before the updated one is appended.
    obj = matching[0]
    remaining = [entry for entry in entries if entry[0] != sampleNameType]

    newWeight = "(" + weight + ")"
    if len(obj) > 2:
        # A weight is already attached: multiply it by the new one
        newWeight = obj[2] + "*" + newWeight
    remaining.append((obj[0], obj[1], newWeight))

    samples[sampleName]["name"] = remaining