import ROOT
from fnmatch import fnmatch
from mkShapesRDF.processor.framework.module import Module
import sys
import subprocess
[docs]class Snapshot(Module):
"""
The Snapshot module handles the creation of the final output files.
There are 4 possible scenarios:
- snapshot nominals only
- snapshot variations only in separate files (one file per variation)
- snapshot nominals and variation in separate files (one file per variation and one file for nominal)
- snapshot both nominals and variations in the same file
The module does not run the snapshotting itself, but it creates the Snapshot objects with lazy evaluation.
The snapshots to be run are added to the ``values`` variable:
.. code-block:: python
values.append(
[
"snapshot",
SnapshotObject,
[tmpOutputFilename, copyFromInputFiles , outputFolderPath, outputFilenameEOS]
])
Where
- ``tmpOutputFilename`` is the name of the temporary output file,
- ``copyFromInputFiles`` is a boolean that indicates if the auxiliary keys of the input files should be copied in the output (only done for nominals),
- ``outputFolderPath`` is the path of the output folder
- ``outputFilenameEOS`` is the name of the output file in the EOS folder (final output file name)
"""
[docs] def __init__(
self,
tmpOutputFilename,
columns,
eosPath,
outputFilename,
includeVariations=True,
splitVariations=True,
storeNominals=True,
):
super().__init__("Snapshot")
self.tmpOutputFilename = tmpOutputFilename
self.columns = columns
self.eosPath = eosPath
self.outputFilename = outputFilename
self.includeVariations = includeVariations
self.splitVariations = splitVariations
self.storeNominals = storeNominals
[docs] def SplitVariations(self, df):
"""
Create a Snapshot object for each variation and tag.
Parameters
----------
df : mRDF
The ``mRDF`` object to use for snapshot
"""
# create separate files for each variation and tag
for variationName in df.variations:
for tag in df.variations[variationName]["tags"]:
tmp_varied_cols = df.GetVariedColumns_oneVariation(
self.saveColumns, variationName, tag
)
outputFilename = self.tmpOutputFilename.split(".")
outputFilename[-2] += "__" + variationName + "_" + tag
outputFilename = ".".join(outputFilename)
self.snapshots.append(
[
variationName + "_" + tag,
tmp_varied_cols,
outputFilename,
False,
self.eosPath + "__" + variationName + "_" + tag,
self.outputFilename,
]
)
[docs] def StoreNominals(self, df):
"""
Create a Snapshot object for the nominal columns.
Parameters
----------
df : mRDF
The ``mRDF`` object to use for snapshot
"""
# store only nominals and remove variations from the list
tmp_varied_cols = df.GetVariedColumns(self.saveColumns)
saveColumns = list(set(self.saveColumns).difference(set(tmp_varied_cols)))
self.snapshots.append(
[
"nominal",
saveColumns,
self.tmpOutputFilename,
True,
self.eosPath,
self.outputFilename,
]
)
[docs] def StoreNominalsAndVariations(self, df):
"""
Create a Snapshot object containing both nominals and variations.
Parameters
----------
df : mRDF
The ``mRDF`` object to use for snapshot
"""
# store variation in the same file as nominal
tmp_varied_cols = df.GetVariedColumns(self.saveColumns)
self.snapshots.append(
[
"nominal and varied",
self.saveColumns + tmp_varied_cols,
self.tmpOutputFilename,
True,
self.eosPath,
self.outputFilename,
]
)
[docs] def runModule(self, df, values):
print("Snapshotting to", self.outputFilename, "columns:\n", self.columns)
cols = df.GetColumnNames()
saveColumns = []
for pattern in self.columns:
tmp_cols = list(set(filter(lambda k: fnmatch(k, pattern), cols)))
if len(tmp_cols) == 0:
print("Warning: no columns found to snapshot with pattern", pattern)
saveColumns = list(set(saveColumns).union(set(tmp_cols)))
self.saveColumns = list(set(saveColumns))
self.snapshots = []
if self.includeVariations and self.splitVariations and self.storeNominals:
self.SplitVariations(df)
self.StoreNominals(df)
if self.includeVariations and self.splitVariations and not self.storeNominals:
self.SplitVariations(df)
if (
self.includeVariations
and not self.splitVariations
and not self.storeNominals
):
print(
"Cannot snapshot variations all together without storing nominal\nEither set storeNominals=True or splitVariations=True",
file=sys.stderr,
)
sys.exit(1)
if (
not self.includeVariations
and not self.splitVariations
and not self.storeNominals
):
print(
"Snapshot without variations and without storing nominal is not supported",
file=sys.stderr,
)
sys.exit(1)
if (
not self.includeVariations
and self.splitVariations
and not self.storeNominals
):
print(
"Snapshot without variations and with split variations is not supported\nTurn on includeVariations",
file=sys.stderr,
)
sys.exit(1)
if not self.includeVariations and self.splitVariations and self.storeNominals:
print(
"Snapshot without variations and with split variations is not supported\nTurn on includeVariations",
file=sys.stderr,
)
sys.exit(1)
if (
not self.includeVariations
and not self.splitVariations
and self.storeNominals
):
self.StoreNominals(df)
if self.includeVariations and not self.splitVariations and self.storeNominals:
self.StoreNominalsAndVariations(df)
for snapshot in self.snapshots:
_cols = sorted(list(set(snapshot[1])))
print("Final list of variables for snapshot", snapshot[0], _cols)
opts = ROOT.RDF.RSnapshotOptions()
opts.fLazy = True
opts.fMode = "UPDATE"
opts.fOverwriteIfExists = True
opts.fCompressionAlgorithm = ROOT.ROOT.kLZMA
opts.fCompressionLevel = 9
values.append(
[
"snapshot",
df.Snapshot("Events", snapshot[2], _cols, opts),
snapshot[2:],
]
)
return df