olivepy
utils
module
olivepy.utils.pem
Encapsulate pem file IO
Abstracts reading from and writing to PEM files and should provide common operations on PEM records such as duration(). The str() method of this class will result in a string that is legal PEM format. All operations are performed in memory so don't use any extraordinarily huge PEM files.
Each line in a PEM file has: filename, channel, class_label, start_t, end_t
Pem
get_maximum_duration(self)
Get duration of maximum duration record in PEM. Intended only for cases where PEM contains only one ID.
Source code in olivepy/utils/pem.py
def get_maximum_duration(self):
    """
    Return the duration of the longest record held in this PEM.

    Records of every ID are scanned together, so the result is only
    meaningful when the PEM contains a single ID.

    :return: the largest ``float(rec.duration())`` found, or 0 if no record has a duration greater than zero
    """
    longest = 0
    for key in self.__record_map.keys():
        for rec in self.__record_map[key]:
            # max() keeps the current value on ties, matching a strict ">" test
            longest = max(longest, float(rec.duration()))
    return longest
get_minimum_duration(self)
Get duration of minimum duration record in PEM. Intended only for cases where PEM contains only one ID.
Source code in olivepy/utils/pem.py
def get_minimum_duration(self):
    """
    Return the duration of the shortest record held in this PEM.

    The search starts from ``self.get_total_duration()`` and is lowered by
    any record whose duration is smaller. Records of every ID are scanned
    together, so the result is only meaningful when the PEM contains a
    single ID.

    :return: the smallest ``float(rec.duration())`` found, or the value of ``get_total_duration()`` if no record is shorter than it
    """
    shortest = self.get_total_duration()
    for key in self.__record_map.keys():
        for rec in self.__record_map[key]:
            # min() keeps the current value on ties, matching a strict "<" test
            shortest = min(shortest, float(rec.duration()))
    return shortest
PemRecord
The underlying PEM container
__init__(self, id, channel, label, start_t, end_t, decimal=False)
special
Parameters:
Name | Type | Description | Default |
---|---|---|---|
id |
generally the filename |
required | |
channel |
the channel (if stereo). May be a string list (i.e. "1,2"). No validation is done for the channel value |
required | |
label |
a "class" label for this segment. Examples include speaker, language, speech, etc |
required | |
start_t |
the start time in seconds |
required | |
end_t |
the end time in seconds |
required | |
decimal |
if true, start_t and end_t are stored as given (no conversion); otherwise they are converted to float |
False |
Source code in olivepy/utils/pem.py
def __init__(self, id, channel, label, start_t, end_t, decimal=False):
    '''
    Create a record describing one labeled segment of a file.

    :param id: generally the filename
    :param channel: the channel (if stereo). May be a string list (i.e. "1,2"). No validation is done for the channel value
    :param label: a "class" label for this segment. Examples include speaker, language, speech, etc
    :param start_t: the start time in seconds
    :param end_t: the end time in seconds
    :param decimal: if true, start_t and end_t are stored exactly as passed in; otherwise both are converted to float
    :raises Exception: if the stored start time is greater than the stored end time
    '''
    self.id, self.channel, self.label = id, channel, label

    # Float is the default because Decimal was found to make the code
    # about 100x slower; decimal=True keeps the caller's values untouched.
    convert = (lambda value: value) if decimal else float
    self.start_t = convert(start_t)
    self.end_t = convert(end_t)

    if self.start_t > self.end_t:
        raise Exception("Start is after end in PemRecord: {self}".format(self=self))
split_channels(self)
Split the channel into an array, so that if a channel value of '1,2' is supplied it is returned as an array [1,2]
Returns:
Type | Description |
---|---|
an array of channel numbers |
Source code in olivepy/utils/pem.py
def split_channels(self):
    '''
    Return this record's channel value as a list of channel numbers.

    A string such as '1,2' is split on commas and each piece converted with
    int(), giving [1, 2]. An int is wrapped in a one-element list. Any other
    type is reported on stdout and produces an empty list.

    :return: an array of channel numbers
    '''
    raw = self.channel
    kind = type(raw)

    if kind is str:
        return [int(piece) for piece in raw.split(',')]
    if kind is int:
        return [raw]

    print("Unsupported channel value: {}".format(self.channel))
    return []
olivepy.utils.utils
parse_json_options(option_str)
Parse options from a json string. Intended to be used for workflow options that may be grouped by one or more tasks. Options can be passed in a couple of different structures. In the more complicated case they can be a list of dictionaries, that specify the task/job name these options are used for, for example: '[{"task":"SAD", "options":{"filter_length":99, "interpolate":1.0}}]' They can also be passed in a simple dictionary, like: '{"filter_length":99, "interpolate":1.0, "name":"midge"}'. In the former example, options are only passed to the job/task specified. In the latter case, these options are passed to all tasks. In both cases, OLIVE will only pass options to a task if the task supports that option name
Parameters:
Name | Type | Description | Default |
---|---|---|---|
option_str |
the options to parse |
required |
Source code in olivepy/utils/utils.py
def parse_json_options(option_str):
    """
    Convert a JSON option string into a list of OptionValue messages.

    Two input shapes are accepted:

    * a list of dictionaries, each holding an "options" dictionary and,
      optionally, the "task" and/or "job" those options are restricted to, e.g.
      '[{"task":"SAD", "options":{"filter_length":99, "interpolate":1.0}}]' or
      '[{"job":"SAD LID Job", "task":"LID", "options": {"filter_length":99}}]'
    * a plain dictionary of name/value pairs that is not tied to any task, e.g.
      '{"filter_length":99, "interpolate":1.0, "name":"midge"}'

    Every option value is stored as its str() representation. The resulting
    list is also printed to stdout before being returned.

    :param option_str: the options to parse
    :return: a list of OptionValue objects created from the JSON option input
    """
    parsed = json.loads(option_str)
    messages = []

    def new_message(name, value):
        # One OptionValue per option; values are always sent as strings
        message = olivepb.OptionValue()
        message.name = name
        message.value = str(value)
        return message

    if isinstance(parsed, list):
        # Task/job specific form: one entry per group of options
        for entry in parsed:
            entry_opts = entry['options']
            for name in entry_opts:
                message = new_message(name, entry_opts[name])
                # The task and job filters are optional and only set when given
                if 'task' in entry:
                    message.task_filter_name = entry['task']
                if 'job' in entry:
                    message.job_filter_name = entry['job']
                messages.append(message)
    else:
        # Simple form, like: {"filter_length":99, "interpolate":1.0}
        # No task/job filter is attached to these options.
        for name in parsed:
            messages.append(new_message(name, parsed[name]))

    print("Final json options: {}".format(messages))
    return messages
parse_json_options_as_dict(option_str)
Parse options from a json string. Intended to be used for workflow options that may be grouped by one or more tasks. Options can be passed in a couple of different structures. In the more complicated case they can be a list of dictionaries, that specify the task/job name these options are used for, for example: '[{"task":"SAD", "options":{"filter_length":99, "interpolate":1.0}}]' They can also be passed in a simple dictionary, like: '{"filter_length":99, "interpolate":1.0, "name":"midge"}'. In the former example, options are only passed to the job/task specified. In the latter case, these options are passed to all tasks. In both cases, OLIVE will only pass options to a task if the task supports that option name
Parameters:
Name | Type | Description | Default |
---|---|---|---|
option_str |
the options to parse |
required |
Returns:
Type | Description |
---|---|
a dictionary of options name/value pairs |
Source code in olivepy/utils/utils.py
def parse_json_options_as_dict(option_str):
    """
    Parse options from a json string into a single name/value dictionary.

    Two input shapes are accepted:

    * a list of dictionaries, each holding an "options" dictionary and,
      optionally, a "task" and/or "job" name, for example:
      '[{"task":"SAD", "options":{"filter_length":99, "interpolate":1.0}}]'.
      The "options" of every entry are merged into one dictionary; when the
      same option name appears in more than one entry the last one wins.
    * a simple dictionary, like:
      '{"filter_length":99, "interpolate":1.0, "name":"midge"}', which is
      returned as parsed.

    The options found are printed to stdout as they are processed.

    :param option_str: the options to parse
    :return: a dictionary of options name/value pairs
    :raises KeyError: if an entry of the list form has no "options" key
    """
    json_opts = json.loads(option_str)

    if isinstance(json_opts, list):
        # Task specific form: flatten every entry into one dictionary
        out_opts = dict()
        for item in json_opts:
            in_opts = item['options']
            # "task" is optional (an entry may name only a "job"), so it must
            # not be indexed directly just to produce this message
            print("Found {} options for task: {}".format(len(in_opts), item.get('task')))
            out_opts.update(in_opts)
            for opt in in_opts:
                print("\t{} = {}, value type: {}".format(opt, in_opts[opt], type(in_opts[opt])))
    else:
        # Simple form, like: {"filter_length":99, "interpolate":1.0}
        out_opts = json_opts
        for opt in json_opts:
            print("\t{} = {}, value type: {}".format(opt, json_opts[opt], type(json_opts[opt])))

    print("Final json options: {}".format(out_opts))
    return out_opts
parse_pem_file(data_lines)
Parse a PEM file, grouping the results by audio file and channel.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
data_lines |
the data lines to parse |
required |
Returns:
Type | Description |
---|---|
a dictionary of audio files to score and the channel regions, grouped by class label: # {'filename': {channel: {class_id : [(start_region, end_region)]} } } |
Source code in olivepy/utils/utils.py
def parse_pem_file(data_lines):
    """
    Parse PEM data lines, grouping the regions by audio file, channel and class label.

    Each record ID has environment variables expanded (os.path.expandvars)
    before being used as the audio file key. A record whose channel is a
    string such as "1,2" contributes its region to every listed channel; a
    record whose channel is neither a str nor an int is reported on stdout
    and skipped.

    :param data_lines: the data lines to parse
    :return: a dictionary of audio files to score and the channel regions:
            # {'filename': {channel: {class_id : [(start_region, end_region)]} } }
    """
    pem = Pem()
    pem.add_records_from_data_lines(data_lines)

    regions = {}
    for record_id in pem.get_ids():
        audio_id = os.path.expandvars(record_id)
        # Start a fresh channel map for the current file
        by_channel = regions[audio_id] = {}

        for rec in pem.get_records(record_id):
            # The channel may be a single int or a comma separated string
            channel = rec.channel
            if type(channel) is str:
                channels = map(int, channel.split(','))
            elif type(channel) is int:
                channels = [channel]
            else:
                print("Unsupported channel value: {}".format(rec.channel))
                channels = []

            for ch in channels:
                by_label = by_channel.setdefault(ch, {})
                by_label.setdefault(rec.label, []).append((rec.start_t, rec.end_t))

    return regions