Source code for samples

"""
Defines the samples and the list of files together with the weights to use for them.

Examples
--------

>>> from mkShapesRDF.lib.search_files import SearchFiles
>>> searchFiles = SearchFiles()
>>> redirector = ""


>>> mcProduction = "Summer16_102X_nAODv7_Full2016v7"
>>> dataReco = "Run2016_102X_nAODv7_Full2016v7"
>>> mcSteps = "MCl1loose2016v7__MCCorr2016v7__l2loose__l2tightOR2016v7"
>>> fakeSteps = "DATAl1loose2016v7__l2loose__fakeW"
>>> dataSteps = "DATAl1loose2016v7__l2loose__l2tightOR2016v7"
>>> ##############################################
>>> ###### Tree base directory for the site ######
>>> ##############################################
>>> treeBaseDir = "/eos/cms/store/group/phys_higgs/cmshww/amassiro/HWWNano"
>>> limitFiles = -1
>>> def makeMCDirectory(var=""):
>>>     _treeBaseDir = treeBaseDir + ""
>>>     if redirector != "":
>>>         _treeBaseDir = redirector + treeBaseDir
>>>     if var == "":
>>>         return "/".join([_treeBaseDir, mcProduction, mcSteps])
>>>     else:
>>>         return "/".join([_treeBaseDir, mcProduction, mcSteps + "__" + var])
>>>
>>>
>>> mcDirectory = makeMCDirectory()
>>> fakeDirectory = os.path.join(treeBaseDir, dataReco, fakeSteps)
>>> dataDirectory = os.path.join(treeBaseDir, dataReco, dataSteps)



>>> samples = {}
>>> DataRun = [
>>>     ["B", "Run2016B-02Apr2020_ver2-v1"],
>>>     ["C", "Run2016C-02Apr2020-v1"],
>>>     ["D", "Run2016D-02Apr2020-v1"],
>>>     ["E", "Run2016E-02Apr2020-v1"],
>>>     ["F", "Run2016F-02Apr2020-v1"],
>>>     ["G", "Run2016G-02Apr2020-v1"],
>>>     ["H", "Run2016H-02Apr2020-v1"],
>>> ]
>>> DataSets = ["MuonEG", "SingleMuon", "SingleElectron", "DoubleMuon", "DoubleEG"]
>>> DataTrig = {
>>>     "MuonEG": " Trigger_ElMu",
>>>     "SingleMuon": "!Trigger_ElMu && Trigger_sngMu",
>>>     "SingleElectron": "!Trigger_ElMu && !Trigger_sngMu && Trigger_sngEl",
>>>     "DoubleMuon": "!Trigger_ElMu && !Trigger_sngMu && !Trigger_sngEl && Trigger_dblMu",
>>>     "DoubleEG": "!Trigger_ElMu && !Trigger_sngMu && !Trigger_sngEl && !Trigger_dblMu && Trigger_dblEl",
>>> }


>>> mcCommonWeightNoMatch = "XSWeight*SFweight*METFilter_MC"
>>> mcCommonWeight = "XSWeight*SFweight*PromptGenLepMatch2l*METFilter_MC"
>>> 
>>> 
>>> ###### Zjj EWK #######
>>> 
>>> files = nanoGetSampleFiles(mcDirectory, "EWK_LLJJ_MLL-50_MJJ-120")
>>> 
>>> samples["Zjj"] = {
>>>     "name": files,
>>>     "weight": mcCommonWeight,
>>>     "FilesPerJob": 1,
>>> }
>>> 
>>> 
>>> ###### DY MC ######
>>> dys = {
>>>     "DY_hardJets": "hardJets",
>>>     "DY_PUJets": "PUJets",
>>> }
>>> 
>>> files = nanoGetSampleFiles(mcDirectory, "DYJetsToLL_M-50_ext2")
>>> 
>>> samples["DY"] = {
>>>     "name": files,
>>>     "weight": mcCommonWeight
>>>     + "*( !(Sum(PhotonGen_isPrompt==1 && PhotonGen_pt>15 && abs(PhotonGen_eta)<2.6) > 0)) * ewknloW",
>>>     "FilesPerJob": 5,
>>>     "subsamples": dys,
>>> }
>>> 
>>> 
>>> ###########################################
>>> ################## DATA ###################
>>> ###########################################
>>> 
>>> samples["DATA"] = {
>>>     "name": [],
>>>     "weight": "METFilter_DATA*LepWPCut",
>>>     "weights": [],
>>>     "isData": ["all"],
>>>     "FilesPerJob": 50,
>>> }
>>> 
>>> for _, sd in DataRun:
>>>     for pd in DataSets:
>>>         files = nanoGetSampleFiles(dataDirectory, pd + "_" + sd)
>>> 
>>>         samples["DATA"]["name"].extend(files)
>>>         addSampleWeight(samples, "DATA", pd + "_" + sd, DataTrig[pd])
>>>         # samples['DATA']['weights'].extend([DataTrig[pd]] * len(files))
"""
# flake8: noqa E266
from mkShapesRDF.lib.search_files import SearchFiles
import os



[docs]def nanoGetSampleFiles(path, name): """ Retrieve files given path and name Parameters ---------- path : str path to folder where to look for files name : str name of the file to look for Returns ------- `list of tuple` list of tuples in the form of ``(name, list of files)`` Notes ----- This function uses SearchFiles (the object ``searchFile``) to retrieve the files and the Latino naming convention is assumed. The ``redirector`` defined above is also used. """ _files = searchFiles.searchFiles(path, name, redirector=redirector) if limitFiles != -1 and len(_files) > limitFiles: return [(name, _files[:limitFiles])] else: return [(name, _files)]
[docs]def CombineBaseW(samples, proc, samplelist): """ Combine baseW for a given process. If two samples (different names) enter the same phase space the new baseW will consider the XS and the sum of all ``genEventSumw`` across all the files. Parameters ---------- samples : dict dictionary of samples proc : str the samples key for the process samplelist : `list of str` list of sample name inside ``samples[proc]`` to combine Notes ----- Will call ``addSampleWeight`` for each sample in ``samplelist``. """ _filtFiles = list(filter(lambda k: k[0] in samplelist, samples[proc]["name"])) _files = list(map(lambda k: k[1], _filtFiles)) _l = list(map(lambda k: len(k), _files)) leastFiles = _files[_l.index(min(_l))] dfSmall = ROOT.RDataFrame("Runs", leastFiles) s = dfSmall.Sum("genEventSumw").GetValue() f = ROOT.TFile(leastFiles[0]) t = f.Get("Events") t.GetEntry(1) xs = t.baseW * s __files = [] for f in _files: __files += f df = ROOT.RDataFrame("Runs", __files) s = df.Sum("genEventSumw").GetValue() newbaseW = str(xs / s) weight = newbaseW + "/baseW" for iSample in samplelist: addSampleWeight(samples, proc, iSample, weight)
[docs]def addSampleWeight(samples, sampleName, sampleNameType, weight): """ Add weight to a sample Parameters ---------- samples : dict dictionary of samples sampleName : str the samples key for the process sampleNameType : str the sample name inside ``samples[proc]`` to add the weight to weight : str the weight to add """ obj = list(filter(lambda k: k[0] == sampleNameType, samples[sampleName]["name"]))[0] samples[sampleName]["name"] = list( filter(lambda k: k[0] != sampleNameType, samples[sampleName]["name"]) ) if len(obj) > 2: samples[sampleName]["name"].append( (obj[0], obj[1], obj[2] + "*(" + weight + ")") ) else: samples[sampleName]["name"].append((obj[0], obj[1], "(" + weight + ")"))