avs-device-sdk/CapabilityAgents/AIP/src/AudioInputProcessor.cpp

/*
* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License").
* You may not use this file except in compliance with the License.
* A copy of the License is located at
*
* http://aws.amazon.com/apache2.0/
*
* or in the "license" file accompanying this file. This file is distributed
* on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#include <algorithm>
#include <cctype>
#include <functional>
#include <list>
#include <map>
#include <set>
#include <sstream>
#include <unordered_map>
#include <rapidjson/document.h>
#include <rapidjson/stringbuffer.h>
#include <rapidjson/writer.h>
#include <AVSCommon/AVS/CapabilityConfiguration.h>
#include <AVSCommon/AVS/FocusState.h>
#include <AVSCommon/AVS/MessageRequest.h>
#include <AVSCommon/Utils/JSON/JSONGenerator.h>
#include <AVSCommon/Utils/JSON/JSONUtils.h>
#include <AVSCommon/Utils/Logger/Logger.h>
#include <AVSCommon/Utils/Memory/Memory.h>
#include <AVSCommon/Utils/Metrics.h>
#include <AVSCommon/Utils/Metrics/DataPointDurationBuilder.h>
#include <AVSCommon/Utils/Metrics/DataPointCounterBuilder.h>
#include <AVSCommon/Utils/Metrics/DataPointStringBuilder.h>
#include <AVSCommon/Utils/Metrics/MetricEventBuilder.h>
#include <AVSCommon/Utils/String/StringUtils.h>
#include <AVSCommon/Utils/UUIDGeneration/UUIDGeneration.h>
#include <AVSCommon/AVS/Attachment/AttachmentUtils.h>
#include <Settings/SettingEventMetadata.h>
#include <Settings/SharedAVSSettingProtocol.h>
#include <SpeechEncoder/SpeechEncoder.h>
#include <AVSCommon/AVS/Attachment/DefaultAttachmentReader.h>
#include "AIP/AudioInputProcessor.h"
namespace alexaClientSDK {
namespace capabilityAgents {
namespace aip {
using namespace avsCommon::avs;
using namespace avsCommon::utils;
using namespace avsCommon::utils::logger;
using namespace avsCommon::utils::metrics;
using namespace avsCommon::utils::json;
using namespace avsCommon::sdkInterfaces;
using namespace std::chrono;
/// SpeechRecognizer capability constants
/// SpeechRecognizer interface type
static const std::string SPEECHRECOGNIZER_CAPABILITY_INTERFACE_TYPE = "AlexaInterface";
/// SpeechRecognizer interface name
static const std::string SPEECHRECOGNIZER_CAPABILITY_INTERFACE_NAME = "SpeechRecognizer";
/// SpeechRecognizer interface version
#ifdef OVERRIDE_SPEECHRECOGNIZER_VERSION
static const std::string SPEECHRECOGNIZER_CAPABILITY_INTERFACE_VERSION = OVERRIDE_SPEECHRECOGNIZER_VERSION;
#else
static const std::string SPEECHRECOGNIZER_CAPABILITY_INTERFACE_VERSION = "2.3";
#endif
/// Configuration key used to give more details about the device configuration.
static const std::string CAPABILITY_INTERFACE_CONFIGURATIONS_KEY = "configurations";
/// Supported wake words key.
static const std::string CAPABILITY_INTERFACE_WAKE_WORDS_KEY = "wakeWords";
/// Key for the maximum number of concurrently supported wake words.
static const std::string CAPABILITY_MAXIMUM_CONCURRENT_WAKEWORDS_KEY = "maximumConcurrentWakeWords";
/// The scope key for each wake words set.
static const std::string CAPABILITY_INTERFACE_SCOPES_KEY = "scopes";
/// The wake word values for a given scope.
static const std::string CAPABILITY_INTERFACE_VALUES_KEY = "values";
/// The scope used to report the default wake words supported across locales.
static const std::string CAPABILITY_INTERFACE_DEFAULT_LOCALE = "DEFAULT";
/// String to identify log entries originating from this file.
static const std::string TAG("AudioInputProcessor");
/**
* Create a LogEntry using this file's TAG and the specified event string.
*
* @param event The event string for this @c LogEntry.
*/
#define LX(event) alexaClientSDK::avsCommon::utils::logger::LogEntry(TAG, event)
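// Example usage (this pattern is used throughout this file):
//
//     ACSDK_ERROR(LX("createFailed").d("reason", "nullMessageSender"));
//
// This emits a log entry tagged "AudioInputProcessor" with the event "createFailed" and the key/value datum
// reason=nullMessageSender.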
/// The name of the @c FocusManager channel used by @c AudioInputProcessor.
static const std::string CHANNEL_NAME = FocusManagerInterface::DIALOG_CHANNEL_NAME;
/// The namespace for this capability agent.
static const std::string NAMESPACE = "SpeechRecognizer";
/// The StopCapture directive signature.
static const avsCommon::avs::NamespaceAndName STOP_CAPTURE{NAMESPACE, "StopCapture"};
/// The ExpectSpeech directive signature.
static const avsCommon::avs::NamespaceAndName EXPECT_SPEECH{NAMESPACE, "ExpectSpeech"};
/// The SetEndOfSpeechOffset directive signature.
static const avsCommon::avs::NamespaceAndName SET_END_OF_SPEECH_OFFSET{NAMESPACE, "SetEndOfSpeechOffset"};
/// The SetWakeWordConfirmation directive signature.
static const avsCommon::avs::NamespaceAndName SET_WAKE_WORD_CONFIRMATION{NAMESPACE, "SetWakeWordConfirmation"};
/// The SetSpeechConfirmation directive signature.
static const avsCommon::avs::NamespaceAndName SET_SPEECH_CONFIRMATION{NAMESPACE, "SetSpeechConfirmation"};
/// The SetWakeWords directive signature.
static const avsCommon::avs::NamespaceAndName SET_WAKE_WORDS{NAMESPACE, "SetWakeWords"};
/// The field identifying the initiator.
static const std::string INITIATOR_KEY = "initiator";
/// The field identifying the ASR profile in the Recognize event payload.
static const std::string PROFILE_KEY = "profile";
/// The field identifying the audio format in the Recognize event payload.
static const std::string FORMAT_KEY = "format";
/// The field identifying the initiator's type.
static const std::string TYPE_KEY = "type";
/// The field identifying the initiator's token.
static const std::string TOKEN_KEY = "token";
/// The field identifying the initiator's payload.
static const std::string PAYLOAD_KEY = "payload";
/// The field identifying the initiator's wakeword indices.
static const std::string WAKEWORD_INDICES_KEY = "wakeWordIndices";
/// The field identifying the initiator's wakeword start index.
static const std::string START_INDEX_KEY = "startIndexInSamples";
/// The field identifying the initiator's wakeword end index.
static const std::string END_INDEX_KEY = "endIndexInSamples";
/// The field identifying the initiator's wake word.
static const std::string WAKE_WORD_KEY = "wakeWord";
/// The field name for the user voice attachment.
static const std::string AUDIO_ATTACHMENT_FIELD_NAME = "audio";
/// The field name for the wake word engine metadata.
static const std::string KWD_METADATA_FIELD_NAME = "wakewordEngineMetadata";
/// The field name for the start of speech timestamp, reported in milliseconds since epoch. This field is included in
/// the Recognize event and is sent back as part of the SetEndOfSpeechOffset payload.
static const std::string START_OF_SPEECH_TIMESTAMP_FIELD_NAME = "startOfSpeechTimestamp";
/// The field name for the end of speech offset, reported in milliseconds, sent as part of the SetEndOfSpeechOffset
/// payload.
static const std::string END_OF_SPEECH_OFFSET_FIELD_NAME = "endOfSpeechOffsetInMilliseconds";
/// The value of the WakeWordConfirmationChanged Event name.
static const std::string WAKE_WORD_CONFIRMATION_CHANGED_EVENT_NAME = "WakeWordConfirmationChanged";
/// The value of the WakeWordConfirmationReport Event name.
static const std::string WAKE_WORD_CONFIRMATION_REPORT_EVENT_NAME = "WakeWordConfirmationReport";
/// The value of the payload key for wakeWordConfirmation.
static const std::string WAKE_WORD_CONFIRMATION_PAYLOAD_KEY = "wakeWordConfirmation";
/// The value of the SpeechConfirmationChanged Event name.
static const std::string SPEECH_CONFIRMATION_CHANGED_EVENT_NAME = "SpeechConfirmationChanged";
/// The value of the SpeechConfirmationReport Event name.
static const std::string SPEECH_CONFIRMATION_REPORT_EVENT_NAME = "SpeechConfirmationReport";
/// The value of the payload key for speechConfirmation.
static const std::string SPEECH_CONFIRMATION_PAYLOAD_KEY = "speechConfirmation";
/// The value of the WakeWordsChanged Event name.
static const std::string WAKE_WORDS_CHANGED_EVENT_NAME = "WakeWordsChanged";
/// The value of the WakeWordsReport Event name.
static const std::string WAKE_WORDS_REPORT_EVENT_NAME = "WakeWordsReport";
/// The value of the payload key for wake words.
static const std::string WAKE_WORDS_PAYLOAD_KEY = "wakeWords";
/// The component name used for power resource management
static const std::string POWER_RESOURCE_COMPONENT_NAME = "AudioInputProcessor";
/// Metric Activity Name Prefix for AIP metric source
static const std::string METRIC_ACTIVITY_NAME_PREFIX_AIP = "AIP-";
/// Start of Utterance Activity Name for AIP metric source
static const std::string START_OF_UTTERANCE = "START_OF_UTTERANCE";
static const std::string START_OF_UTTERANCE_ACTIVITY_NAME = METRIC_ACTIVITY_NAME_PREFIX_AIP + START_OF_UTTERANCE;
/// Name of the uploaded audio bytes metric for OPUS-encoded streams
static const std::string WAKEWORD_DETECTION_SEGMENT_UPLOADED_OPUS = "AIP_WAKEWORD_DETECTION_SEGMENT_UPLOADED_OPUS";
/// Name of the uploaded audio bytes metric for PCM-encoded streams
static const std::string WAKEWORD_DETECTION_SEGMENT_UPLOADED_PCM = "AIP_WAKEWORD_DETECTION_SEGMENT_UPLOADED_PCM";
/// Metric name recorded when the Recognize event has been built, for the AIP metric source
static const std::string RECOGNIZE_START_SEND_MESSAGE = "RECOGNIZE_EVENT_IS_BUILT";
/// Start of stream timestamp metric name for AIP metric source
static const std::string START_OF_STREAM_TIMESTAMP = "START_OF_STREAM_TIMESTAMP";
/// Wake word duration metric name for AIP metric source
static const std::string WW_DURATION = "WW_DURATION";
static const std::string WW_DURATION_ACTIVITY_NAME = METRIC_ACTIVITY_NAME_PREFIX_AIP + WW_DURATION;
/// Stop Capture Received Activity Name for AIP metric source
static const std::string STOP_CAPTURE_RECEIVED = "STOP_CAPTURE";
static const std::string STOP_CAPTURE_RECEIVED_ACTIVITY_NAME = METRIC_ACTIVITY_NAME_PREFIX_AIP + STOP_CAPTURE_RECEIVED;
/// The duration metric for acquire power resource
static const std::string ACQUIRE_POWER_RESOURCE = "ACQUIRE_POWER_RESOURCE";
static const std::string ACQUIRE_POWER_RESOURCE_ACTIVITY = METRIC_ACTIVITY_NAME_PREFIX_AIP + ACQUIRE_POWER_RESOURCE;
/// The duration metric for release power resource
static const std::string RELEASE_POWER_RESOURCE = "RELEASE_POWER_RESOURCE";
static const std::string RELEASE_POWER_RESOURCE_ACTIVITY = METRIC_ACTIVITY_NAME_PREFIX_AIP + RELEASE_POWER_RESOURCE;
/// End of Speech Offset Received Activity Name for AIP metric source
static const std::string END_OF_SPEECH_OFFSET_RECEIVED = "END_OF_SPEECH_OFFSET";
static const std::string END_OF_SPEECH_OFFSET_RECEIVED_ACTIVITY_NAME =
METRIC_ACTIVITY_NAME_PREFIX_AIP + END_OF_SPEECH_OFFSET_RECEIVED;
/// The duration metric used to determine the best SHORT timeout (StopCapture to end of speech)
static const std::string STOP_CAPTURE_TO_END_OF_SPEECH_METRIC_NAME = "STOP_CAPTURE_TO_END_OF_SPEECH";
static const std::string STOP_CAPTURE_TO_END_OF_SPEECH_ACTIVITY_NAME =
METRIC_ACTIVITY_NAME_PREFIX_AIP + STOP_CAPTURE_TO_END_OF_SPEECH_METRIC_NAME;
/// The recognize request initiator metric.
static const std::string INITIATOR_PREFIX = "INITIATOR_";
static const std::string INITIATOR_ACTIVITY_NAME_PREFIX = METRIC_ACTIVITY_NAME_PREFIX_AIP + INITIATOR_PREFIX;
/// AudioEncodingFormat Activity Name for AIP metric source
static const std::string AUDIO_ENCODING_FORMAT_METRIC_NAME = "AUDIO_ENCODING_FORMAT";
static const std::string AUDIO_ENCODING_FORMAT_ACTIVITY_NAME =
METRIC_ACTIVITY_NAME_PREFIX_AIP + AUDIO_ENCODING_FORMAT_METRIC_NAME;
static const std::string AUDIO_ENCODING_FORMAT_OPUS = "OPUSAudioEncoding";
static const std::string AUDIO_ENCODING_FORMAT_LPCM = "LPCMAudioEncoding";
/// The default resolveKey used as a placeholder when only one encoding format is configured for @c AudioInputProcessor
static const std::string DEFAULT_RESOLVE_KEY = "DEFAULT_RESOLVE_KEY";
/// Keys for instance entry metric specific fields
static const std::string ENTRY_METRIC_ACTOR_NAME = "AudioInputProcessor";
static const std::string ENTRY_METRIC_ACTIVITY_NAME = METRIC_ACTIVITY_NAME_PREFIX_AIP + ENTRY_METRIC_ACTOR_NAME;
static const std::string ENTRY_METRIC_KEY_SEGMENT_ID = "segment_id";
static const std::string ENTRY_METRIC_KEY_ACTOR = "actor";
static const std::string ENTRY_METRIC_KEY_ENTRY_TYPE = "entry_type";
static const std::string ENTRY_METRIC_KEY_ENTRY_NAME = "entry_name";
static const std::string ENTRY_METRIC_NAME_STATE_CHANGE = "StateChange";
/// Preroll duration is a fixed 500ms.
static const std::chrono::milliseconds PREROLL_DURATION = std::chrono::milliseconds(500);
/// Number of milliseconds per second.
static const int MILLISECONDS_PER_SECOND = 1000;
/// Threshold number of uploaded audio bytes for the OPUS-encoded wake word detection segment metric
static const int WAKEWORD_DETECTION_SEGMENT_SIZE_BYTES_OPUS = 5209;
/// Threshold number of uploaded audio bytes for the PCM-encoded wake word detection segment metric
static const int WAKEWORD_DETECTION_SEGMENT_SIZE_BYTES_PCM = 40480;
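// For reference: with the LPCM format enforced in executeRecognize() below (16000 Hz, 16-bit, mono, i.e.
// 32000 bytes per second), 40480 bytes corresponds to roughly 1.27 seconds of audio (40480 / 32000 ≈ 1.265 s).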
/**
* Helper function to get string values of encoding audio format, which are used in Recognize event.
* @param encoding Target encoding format
* @return String name of the encoding format
*/
static std::string encodingFormatToString(avsCommon::utils::AudioFormat::Encoding encoding) {
switch (encoding) {
case avsCommon::utils::AudioFormat::Encoding::OPUS:
return "OPUS";
case avsCommon::utils::AudioFormat::Encoding::LPCM:
return "AUDIO_L16_RATE_16000_CHANNELS_1";
}
return "UNKNOWN";
}
/**
* Handles a Metric event by creating and recording it. Failure to create or record the event results
* in an early return.
*
* @param metricRecorder The @c MetricRecorderInterface which records Metric events.
* @param metricEventBuilder The @c MetricEventBuilder.
* @param dialogRequestId The dialogRequestId associated with this Metric event; default is empty string.
*/
static void submitMetric(
const std::shared_ptr<MetricRecorderInterface>& metricRecorder,
MetricEventBuilder& metricEventBuilder,
const std::string& dialogRequestId = "") {
metricEventBuilder.addDataPoint(
DataPointStringBuilder{}.setName("DIALOG_REQUEST_ID").setValue(dialogRequestId).build());
auto metricEvent = metricEventBuilder.build();
if (metricEvent == nullptr) {
ACSDK_ERROR(LX("Error creating metric with explicit dialogRequestId"));
return;
}
recordMetric(metricRecorder, metricEvent);
}
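// Illustrative call (mirrors the START_OF_UTTERANCE metric submitted in executeRecognize() below; the builder
// contents and variable names here are examples only):
//
//     auto builder = MetricEventBuilder{}
//                        .setActivityName(START_OF_UTTERANCE_ACTIVITY_NAME)
//                        .addDataPoint(DataPointCounterBuilder{}.setName(START_OF_UTTERANCE).increment(1).build());
//     submitMetric(m_metricRecorder, builder, dialogRequestId);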
/**
* Handles a Metric event by creating and recording it. Failure to create or record the event results
* in an early return.
*
* @param metricRecorder The @c MetricRecorderInterface which records Metric events.
* @param activityName The activityName of the Metric event.
* @param dataPoint The @c DataPoint of this Metric event (e.g. duration).
* @param directive The @c AVSDirective associated with this Metric event; default is nullptr.
*/
static void submitMetric(
const std::shared_ptr<MetricRecorderInterface>& metricRecorder,
const std::string& activityName,
const DataPoint& dataPoint,
const std::shared_ptr<AVSDirective>& directive = nullptr) {
auto metricEventBuilder = MetricEventBuilder{}.setActivityName(activityName).addDataPoint(dataPoint);
if (directive != nullptr) {
metricEventBuilder.addDataPoint(
DataPointStringBuilder{}.setName("HTTP2_STREAM").setValue(directive->getAttachmentContextId()).build());
metricEventBuilder.addDataPoint(
DataPointStringBuilder{}.setName("DIRECTIVE_MESSAGE_ID").setValue(directive->getMessageId()).build());
metricEventBuilder.addDataPoint(
DataPointStringBuilder{}.setName("DIALOG_REQUEST_ID").setValue(directive->getDialogRequestId()).build());
}
auto metricEvent = metricEventBuilder.build();
if (metricEvent == nullptr) {
ACSDK_ERROR(LX("Error creating metric from directive"));
return;
}
recordMetric(metricRecorder, metricEvent);
}
/**
* Creates and records an instance entry metric with the given identifiers and metadata.
* @param metricRecorder The @c MetricRecorderInterface which records Metric events.
* @param segmentId The segmentId corresponding to this metric event.
* @param name The name of this metric
* @param metadata Any metadata to be associated with this metric; default is empty
*/
static void submitInstanceEntryMetric(
const std::shared_ptr<MetricRecorderInterface>& metricRecorder,
const std::string& segmentId,
const std::string& name,
const std::map<std::string, std::string>& metadata = {}) {
if (segmentId.empty() || name.empty()) {
ACSDK_ERROR(LX(__FUNCTION__).m("Unable to create instance metric").d("segmentId", segmentId).d("name", name));
return;
}
auto metricBuilder = MetricEventBuilder{}.setActivityName(ENTRY_METRIC_ACTIVITY_NAME);
metricBuilder.addDataPoint(
DataPointStringBuilder{}.setName(ENTRY_METRIC_KEY_SEGMENT_ID).setValue(segmentId).build());
metricBuilder.addDataPoint(
DataPointStringBuilder{}.setName(ENTRY_METRIC_KEY_ACTOR).setValue(ENTRY_METRIC_ACTOR_NAME).build());
metricBuilder.addDataPoint(DataPointStringBuilder{}.setName(ENTRY_METRIC_KEY_ENTRY_NAME).setValue(name).build());
metricBuilder.addDataPoint(
DataPointStringBuilder{}.setName(ENTRY_METRIC_KEY_ENTRY_TYPE).setValue("INSTANCE").build());
for (auto const& pair : metadata) {
metricBuilder.addDataPoint(DataPointStringBuilder{}.setName(pair.first).setValue(pair.second).build());
}
auto metric = metricBuilder.build();
if (metric == nullptr) {
ACSDK_ERROR(LX(__FUNCTION__).m("Error creating instance entry metric."));
return;
}
recordMetric(metricRecorder, metric);
}
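// Illustrative call (mirrors the usage at the end of executeRecognize(); the metadata values are examples only):
//
//     submitInstanceEntryMetric(
//         m_metricRecorder, dialogRequestId, START_OF_UTTERANCE, {{"initiator", "WAKEWORD"}});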
/**
* Creates the SpeechRecognizer capability configuration.
*
* @param assetsManager Responsible for retrieving and changing the wake words and locale.
* @return The SpeechRecognizer capability configuration.
*/
static std::shared_ptr<avsCommon::avs::CapabilityConfiguration> getSpeechRecognizerCapabilityConfiguration(
const LocaleAssetsManagerInterface& assetsManager);
/**
* Resolver function, wrapped in a lambda and passed to @c MessageRequest as the
* @c MessageRequestResolveFunction.
* @param[in,out] request @c EditableMessageRequest to resolve
* @param resolveKey Provided resolve key
* @param encodingFormats Mappings from resolveKey to encoding format
* @param attachmentReaders Mappings from resolveKey to attachment readers
* @param metricRecorder @c MetricRecorderInterface object to emit metrics
* @return @c true if the message is resolved successfully, else @c false
*/
static bool resolveMessageRequest(
const std::shared_ptr<avsCommon::avs::EditableMessageRequest>& request,
const std::string& resolveKey,
const AudioInputProcessor::EncodingFormatResponse& encodingFormats,
const std::unordered_map<std::string, std::vector<std::shared_ptr<avsCommon::avs::MessageRequest::NamedReader>>>&
attachmentReaders,
const std::shared_ptr<avsCommon::utils::metrics::MetricRecorderInterface>& metricRecorder);
std::shared_ptr<AudioInputProcessor> AudioInputProcessor::create(
std::shared_ptr<DirectiveSequencerInterface> directiveSequencer,
std::shared_ptr<MessageSenderInterface> messageSender,
std::shared_ptr<ContextManagerInterface> contextManager,
std::shared_ptr<FocusManagerInterface> focusManager,
std::shared_ptr<avsCommon::avs::DialogUXStateAggregator> dialogUXStateAggregator,
std::shared_ptr<ExceptionEncounteredSenderInterface> exceptionEncounteredSender,
std::shared_ptr<UserInactivityMonitorInterface> userInactivityMonitor,
std::shared_ptr<SystemSoundPlayerInterface> systemSoundPlayer,
const std::shared_ptr<LocaleAssetsManagerInterface>& assetsManager,
std::shared_ptr<settings::WakeWordConfirmationSetting> wakeWordConfirmation,
std::shared_ptr<settings::SpeechConfirmationSetting> speechConfirmation,
const std::shared_ptr<avsCommon::avs::CapabilityChangeNotifierInterface>& capabilityChangeNotifier,
std::shared_ptr<settings::WakeWordsSetting> wakeWordsSetting,
std::shared_ptr<speechencoder::SpeechEncoder> speechEncoder,
AudioProvider defaultAudioProvider,
std::shared_ptr<PowerResourceManagerInterface> powerResourceManager,
std::shared_ptr<avsCommon::utils::metrics::MetricRecorderInterface> metricRecorder,
const std::shared_ptr<ExpectSpeechTimeoutHandler>& expectSpeechTimeoutHandler) {
if (!directiveSequencer) {
ACSDK_ERROR(LX("createFailed").d("reason", "nullDirectiveSequencer"));
return nullptr;
} else if (!messageSender) {
ACSDK_ERROR(LX("createFailed").d("reason", "nullMessageSender"));
return nullptr;
} else if (!contextManager) {
ACSDK_ERROR(LX("createFailed").d("reason", "nullContextManager"));
return nullptr;
} else if (!focusManager) {
ACSDK_ERROR(LX("createFailed").d("reason", "nullFocusManager"));
return nullptr;
} else if (!dialogUXStateAggregator) {
ACSDK_ERROR(LX("createFailed").d("reason", "nullDialogUXStateAggregator"));
return nullptr;
} else if (!exceptionEncounteredSender) {
ACSDK_ERROR(LX("createFailed").d("reason", "nullExceptionEncounteredSender"));
return nullptr;
} else if (!userInactivityMonitor) {
ACSDK_ERROR(LX("createFailed").d("reason", "nullUserInactivityMonitor"));
return nullptr;
} else if (!systemSoundPlayer) {
ACSDK_ERROR(LX("createFailed").d("reason", "nullSystemSoundPlayer"));
return nullptr;
} else if (!wakeWordConfirmation) {
ACSDK_ERROR(LX("createFailed").d("reason", "nullWakeWordsConfirmation"));
return nullptr;
} else if (!speechConfirmation) {
ACSDK_ERROR(LX("createFailed").d("reason", "nullSpeechConfirmation"));
return nullptr;
} else if (!assetsManager) {
ACSDK_ERROR(LX("createFailed").d("reason", "nullAssetsManager"));
return nullptr;
} else if (!assetsManager->getDefaultSupportedWakeWords().empty() && !wakeWordsSetting) {
ACSDK_ERROR(LX("createFailed").d("reason", "nullWakeWordsSetting"));
return nullptr;
} else if (!capabilityChangeNotifier) {
ACSDK_ERROR(LX("createFailed").d("reason", "nullCapabilityChangeNotifier"));
return nullptr;
}
auto capabilitiesConfiguration = getSpeechRecognizerCapabilityConfiguration(*assetsManager);
if (!capabilitiesConfiguration) {
ACSDK_ERROR(LX("createFailed").d("reason", "unableToCreateCapabilitiesConfiguration"));
return nullptr;
}
auto aip = std::shared_ptr<AudioInputProcessor>(new AudioInputProcessor(
directiveSequencer,
messageSender,
contextManager,
focusManager,
exceptionEncounteredSender,
userInactivityMonitor,
systemSoundPlayer,
assetsManager,
speechEncoder,
defaultAudioProvider,
wakeWordConfirmation,
speechConfirmation,
capabilityChangeNotifier,
wakeWordsSetting,
capabilitiesConfiguration,
powerResourceManager,
std::move(metricRecorder),
expectSpeechTimeoutHandler));
if (!aip->initialize()) {
ACSDK_ERROR(LX("createFailed").d("reason", "unableToInitialize"));
return nullptr;
}
dialogUXStateAggregator->addObserver(aip);
return aip;
}
avsCommon::avs::DirectiveHandlerConfiguration AudioInputProcessor::getConfiguration() const {
avsCommon::avs::DirectiveHandlerConfiguration configuration;
configuration[STOP_CAPTURE] = BlockingPolicy(BlockingPolicy::MEDIUMS_NONE, false);
configuration[EXPECT_SPEECH] = BlockingPolicy(BlockingPolicy::MEDIUM_AUDIO, true);
configuration[SET_END_OF_SPEECH_OFFSET] = BlockingPolicy(BlockingPolicy::MEDIUMS_NONE, false);
configuration[SET_WAKE_WORD_CONFIRMATION] = BlockingPolicy(BlockingPolicy::MEDIUMS_NONE, false);
configuration[SET_SPEECH_CONFIRMATION] = BlockingPolicy(BlockingPolicy::MEDIUMS_NONE, false);
configuration[SET_WAKE_WORDS] = BlockingPolicy(BlockingPolicy::MEDIUMS_NONE, false);
return configuration;
}
void AudioInputProcessor::addObserver(std::shared_ptr<ObserverInterface> observer) {
if (!observer) {
ACSDK_ERROR(LX("addObserverFailed").d("reason", "nullObserver"));
return;
}
m_executor.submit([this, observer]() { m_observers.insert(observer); });
}
void AudioInputProcessor::removeObserver(std::shared_ptr<ObserverInterface> observer) {
if (!observer) {
ACSDK_ERROR(LX("removeObserverFailed").d("reason", "nullObserver"));
return;
}
m_executor.submit([this, observer]() { m_observers.erase(observer); }).wait();
}
std::future<bool> AudioInputProcessor::recognize(
AudioProvider audioProvider,
Initiator initiator,
steady_clock::time_point startOfSpeechTimestamp,
avsCommon::avs::AudioInputStream::Index begin,
avsCommon::avs::AudioInputStream::Index keywordEnd,
std::string keyword,
std::shared_ptr<const std::vector<char>> KWDMetadata,
const std::string& initiatorToken) {
ACSDK_METRIC_IDS(TAG, "Recognize", "", "", Metrics::Location::AIP_RECEIVE);
ACSDK_DEBUG5(LX(__func__));
std::string upperCaseKeyword = string::stringToUpperCase(keyword);
if (KEYWORD_TEXT_STOP == upperCaseKeyword) {
ACSDK_DEBUG(LX("skippingRecognizeEvent").d("reason", "invalidKeyword").d("keyword", keyword));
std::promise<bool> ret;
ret.set_value(false);
return ret.get_future();
}
// If no begin index was provided, grab the current index ASAP so that we can start streaming from the time this
// call was made.
if (audioProvider.stream && INVALID_INDEX == begin) {
static const bool startWithNewData = true;
auto reader = audioProvider.stream->createReader(
avsCommon::avs::AudioInputStream::Reader::Policy::NONBLOCKING, startWithNewData);
if (!reader) {
ACSDK_ERROR(LX("recognizeFailed").d("reason", "createReaderFailed"));
std::promise<bool> ret;
ret.set_value(false);
return ret.get_future();
}
begin = reader->tell();
}
return m_executor.submit([this,
audioProvider,
initiator,
startOfSpeechTimestamp,
begin,
keywordEnd,
keyword,
KWDMetadata,
initiatorToken]() {
return executeRecognize(
audioProvider, initiator, startOfSpeechTimestamp, begin, keywordEnd, keyword, KWDMetadata, initiatorToken);
});
}
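// Illustrative invocations (a sketch assuming the default arguments declared for the index, keyword, and
// metadata parameters in AudioInputProcessor.h; all values are examples only):
//
//     aip->recognize(tapToTalkProvider, Initiator::TAP, steady_clock::now());
//     aip->recognize(wakeWordProvider, Initiator::WAKEWORD, t0, keywordBegin, keywordEnd, "ALEXA");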
std::future<bool> AudioInputProcessor::stopCapture() {
return m_executor.submit([this]() { return executeStopCapture(); });
}
std::future<void> AudioInputProcessor::resetState() {
ACSDK_DEBUG0(LX(__func__));
return m_executor.submit([this]() { executeResetState(); });
}
void AudioInputProcessor::onContextAvailable(const std::string& jsonContext) {
m_executor.submit([this, jsonContext]() { executeOnContextAvailable(jsonContext); });
}
void AudioInputProcessor::onContextFailure(const ContextRequestError error) {
m_executor.submit([this, error]() { executeOnContextFailure(error); });
}
void AudioInputProcessor::handleDirectiveImmediately(std::shared_ptr<avsCommon::avs::AVSDirective> directive) {
handleDirective(std::make_shared<DirectiveInfo>(directive, nullptr));
}
void AudioInputProcessor::preHandleDirective(std::shared_ptr<DirectiveInfo> info) {
}
void AudioInputProcessor::handleDirective(std::shared_ptr<DirectiveInfo> info) {
if (!info) {
ACSDK_ERROR(LX("handleDirectiveFailed").d("reason", "nullDirectiveInfo"));
return;
}
if (!info->directive) {
ACSDK_ERROR(LX("handleDirectiveFailed").d("reason", "nullDirective"));
return;
}
if (info->directive->getName() == STOP_CAPTURE.name) {
ACSDK_METRIC_MSG(TAG, info->directive, Metrics::Location::AIP_RECEIVE);
submitMetric(
m_metricRecorder,
STOP_CAPTURE_RECEIVED_ACTIVITY_NAME,
DataPointCounterBuilder{}.setName(STOP_CAPTURE_RECEIVED).increment(1).build(),
info->directive);
handleStopCaptureDirective(info);
} else if (info->directive->getName() == EXPECT_SPEECH.name) {
handleExpectSpeechDirective(info);
} else if (info->directive->getName() == SET_END_OF_SPEECH_OFFSET.name) {
handleSetEndOfSpeechOffsetDirective(info);
} else if (info->directive->getName() == SET_WAKE_WORD_CONFIRMATION.name) {
handleSetWakeWordConfirmation(info);
} else if (info->directive->getName() == SET_SPEECH_CONFIRMATION.name) {
handleSetSpeechConfirmation(info);
} else if (info->directive->getName() == SET_WAKE_WORDS.name) {
handleSetWakeWords(info);
} else {
std::string errorMessage =
"unexpected directive " + info->directive->getNamespace() + ":" + info->directive->getName();
m_exceptionEncounteredSender->sendExceptionEncountered(
info->directive->getUnparsedDirective(),
avsCommon::avs::ExceptionErrorType::UNEXPECTED_INFORMATION_RECEIVED,
errorMessage);
if (info->result) {
info->result->setFailed(errorMessage);
}
ACSDK_ERROR(LX("handleDirectiveFailed")
.d("reason", "unknownDirective")
.d("namespace", info->directive->getNamespace())
.d("name", info->directive->getName()));
removeDirective(info);
}
}
void AudioInputProcessor::cancelDirective(std::shared_ptr<DirectiveInfo> info) {
removeDirective(info);
}
void AudioInputProcessor::onDeregistered() {
resetState();
}
void AudioInputProcessor::onFocusChanged(avsCommon::avs::FocusState newFocus, avsCommon::avs::MixingBehavior behavior) {
ACSDK_DEBUG9(LX("onFocusChanged").d("newFocus", newFocus).d("MixingBehavior", behavior));
m_executor.submit([this, newFocus]() { executeOnFocusChanged(newFocus); });
}
void AudioInputProcessor::onDialogUXStateChanged(DialogUXStateObserverInterface::DialogUXState newState) {
m_executor.submit([this, newState]() { executeOnDialogUXStateChanged(newState); });
}
AudioInputProcessor::AudioInputProcessor(
std::shared_ptr<DirectiveSequencerInterface> directiveSequencer,
std::shared_ptr<MessageSenderInterface> messageSender,
std::shared_ptr<ContextManagerInterface> contextManager,
std::shared_ptr<FocusManagerInterface> focusManager,
std::shared_ptr<ExceptionEncounteredSenderInterface> exceptionEncounteredSender,
std::shared_ptr<UserInactivityMonitorInterface> userInactivityMonitor,
std::shared_ptr<SystemSoundPlayerInterface> systemSoundPlayer,
const std::shared_ptr<LocaleAssetsManagerInterface>& assetsManager,
std::shared_ptr<speechencoder::SpeechEncoder> speechEncoder,
AudioProvider defaultAudioProvider,
std::shared_ptr<settings::WakeWordConfirmationSetting> wakeWordConfirmation,
std::shared_ptr<settings::SpeechConfirmationSetting> speechConfirmation,
const std::shared_ptr<avsCommon::avs::CapabilityChangeNotifierInterface>& capabilityChangeNotifier,
std::shared_ptr<settings::WakeWordsSetting> wakeWordsSetting,
std::shared_ptr<avsCommon::avs::CapabilityConfiguration> capabilitiesConfiguration,
std::shared_ptr<PowerResourceManagerInterface> powerResourceManager,
std::shared_ptr<avsCommon::utils::metrics::MetricRecorderInterface> metricRecorder,
const std::shared_ptr<ExpectSpeechTimeoutHandler>& expectSpeechTimeoutHandler) :
CapabilityAgent{NAMESPACE, exceptionEncounteredSender},
RequiresShutdown{"AudioInputProcessor"},
m_metricRecorder{metricRecorder},
m_directiveSequencer{directiveSequencer},
m_messageSender{messageSender},
m_contextManager{contextManager},
m_focusManager{focusManager},
m_userInactivityMonitor{userInactivityMonitor},
m_encoder{speechEncoder},
m_defaultAudioProvider{defaultAudioProvider},
m_lastAudioProvider{AudioProvider::null()},
m_state{ObserverInterface::State::IDLE},
m_focusState{avsCommon::avs::FocusState::NONE},
m_preparingToSend{false},
m_initialDialogUXStateReceived{false},
m_localStopCapturePerformed{false},
m_streamIsClosedInRecognizingState{false},
m_systemSoundPlayer{systemSoundPlayer},
m_assetsManager{assetsManager},
m_precedingExpectSpeechInitiator{nullptr},
m_wakeWordConfirmation{wakeWordConfirmation},
m_speechConfirmation{speechConfirmation},
m_capabilityChangeNotifier{capabilityChangeNotifier},
m_wakeWordsSetting{wakeWordsSetting},
m_powerResourceManager{powerResourceManager},
m_expectSpeechTimeoutHandler{expectSpeechTimeoutHandler},
m_timeSinceLastResumeMS{std::chrono::milliseconds(0)},
m_timeSinceLastPartialMS{std::chrono::milliseconds(0)},
m_resourceFlags{0},
m_usingEncoder{false},
m_messageRequestResolver{nullptr},
m_encodingAudioFormats{{DEFAULT_RESOLVE_KEY, AudioFormat::Encoding::LPCM}} {
m_capabilityConfigurations.insert(capabilitiesConfiguration);
if (m_powerResourceManager) {
m_powerResourceId = m_powerResourceManager->create(
POWER_RESOURCE_COMPONENT_NAME, false, PowerResourceManagerInterface::PowerResourceLevel::ACTIVE_HIGH);
if (!m_powerResourceId) {
ACSDK_ERROR(
LX(__func__).d("reason", "createRawPowerResourceFailed").d("name", POWER_RESOURCE_COMPONENT_NAME));
}
}
}
/**
* Generate supported wake words json capability configuration for a given scope (default, language or locale).
*
* @param scope The scope being reported.
* @param wakeWordsCombination The set of a combination of wake words supported in the given scope.
* @return A json representation of the scope configuration.
*/
std::string generateSupportedWakeWordsJson(
const std::string& scope,
const LocaleAssetsManagerInterface::WakeWordsSets& wakeWordsCombination) {
json::JsonGenerator generator;
generator.addStringArray(CAPABILITY_INTERFACE_SCOPES_KEY, std::list<std::string>({scope}));
generator.addCollectionOfStringArray(CAPABILITY_INTERFACE_VALUES_KEY, wakeWordsCombination);
return generator.toString();
}
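// Illustrative output for scope "DEFAULT" and the wake word combinations {{"ALEXA"}, {"ALEXA", "ECHO"}}
// (wake words are examples only):
//
//     {"scopes":["DEFAULT"],"values":[["ALEXA"],["ALEXA","ECHO"]]}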
std::shared_ptr<avsCommon::avs::CapabilityConfiguration> getSpeechRecognizerCapabilityConfiguration(
const LocaleAssetsManagerInterface& assetsManager) {
std::unordered_map<std::string, std::string> configMap;
configMap.insert({CAPABILITY_INTERFACE_TYPE_KEY, SPEECHRECOGNIZER_CAPABILITY_INTERFACE_TYPE});
configMap.insert({CAPABILITY_INTERFACE_NAME_KEY, SPEECHRECOGNIZER_CAPABILITY_INTERFACE_NAME});
configMap.insert({CAPABILITY_INTERFACE_VERSION_KEY, SPEECHRECOGNIZER_CAPABILITY_INTERFACE_VERSION});
// Generate wake words capability configuration if supportedWakeWords is not empty.
auto defaultWakeWords = assetsManager.getDefaultSupportedWakeWords();
if (!defaultWakeWords.empty()) {
std::set<std::string> wakeWords;
wakeWords.insert(generateSupportedWakeWordsJson(CAPABILITY_INTERFACE_DEFAULT_LOCALE, defaultWakeWords));
for (const auto& entry : assetsManager.getLanguageSpecificWakeWords()) {
wakeWords.insert(generateSupportedWakeWordsJson(entry.first, entry.second));
}
for (const auto& entry : assetsManager.getLocaleSpecificWakeWords()) {
wakeWords.insert(generateSupportedWakeWordsJson(entry.first, entry.second));
}
json::JsonGenerator generator;
generator.addMembersArray(CAPABILITY_INTERFACE_WAKE_WORDS_KEY, wakeWords);
configMap.insert({CAPABILITY_INTERFACE_CONFIGURATIONS_KEY, generator.toString()});
ACSDK_DEBUG5(LX(__func__).d("wakeWords", generator.toString()));
}
return std::make_shared<avsCommon::avs::CapabilityConfiguration>(configMap);
}
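// A sketch of the resulting capability configuration (the exact wake word content depends on the assets
// manager; values shown are examples only):
//
//     {
//       "type": "AlexaInterface",
//       "interface": "SpeechRecognizer",
//       "version": "2.3",
//       "configurations": {"wakeWords":[{"scopes":["DEFAULT"],"values":[["ALEXA"]]}]}
//     }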
settings::SettingEventMetadata AudioInputProcessor::getWakeWordsEventsMetadata() {
return settings::SettingEventMetadata{
NAMESPACE, WAKE_WORDS_CHANGED_EVENT_NAME, WAKE_WORDS_REPORT_EVENT_NAME, WAKE_WORDS_PAYLOAD_KEY};
}
void AudioInputProcessor::doShutdown() {
m_executor.shutdown();
executeResetState();
m_directiveSequencer.reset();
m_messageSender.reset();
m_contextManager.reset();
m_focusManager.reset();
m_userInactivityMonitor.reset();
m_observers.clear();
m_expectSpeechTimeoutHandler.reset();
if (m_powerResourceManager && m_powerResourceId) {
m_powerResourceManager->close(m_powerResourceId);
}
}
bool resolveMessageRequest(
const std::shared_ptr<EditableMessageRequest>& request,
const std::string& resolveKey,
const AudioInputProcessor::EncodingFormatResponse& encodingFormats,
const std::unordered_map<std::string, std::vector<std::shared_ptr<MessageRequest::NamedReader>>>& attachmentReaders,
const std::shared_ptr<avsCommon::utils::metrics::MetricRecorderInterface>& metricRecorder) {
if (!request) {
ACSDK_ERROR(LX("Failed to resolve MessageRequest")
.d("reason", "Null pointer to MessageRequest")
.d("value", resolveKey));
return false;
}
if (encodingFormats.find(resolveKey) == encodingFormats.end()) {
ACSDK_ERROR(LX("Failed to resolve MessageRequest").d("reason", "Invalid resolveKey").d("value", resolveKey));
return false;
}
rapidjson::Document eventWithContext;
if (!jsonUtils::parseJSON(request->getJsonContent(), &eventWithContext) || !eventWithContext.IsObject()) {
    ACSDK_ERROR(LX("Failed to resolve MessageRequest").d("reason", "Unable to parse json content"));
    return false;
}
auto event = eventWithContext.FindMember("event");
if (event == eventWithContext.MemberEnd()) {
ACSDK_ERROR(LX("Failed to resolve MessageRequest").d("reason", "No event found in json"));
return false;
}
auto payload = event->value.FindMember("payload");
if (payload == event->value.MemberEnd()) {
ACSDK_ERROR(LX("Failed to resolve MessageRequest").d("reason", "No payload in json"));
return false;
}
rapidjson::Value formatValue;
auto encodingFormat = encodingFormats.at(resolveKey);
auto formatString = encodingFormatToString(encodingFormat);
formatValue.SetString(formatString.c_str(), static_cast<rapidjson::SizeType>(formatString.length()));
if (payload->value.FindMember(FORMAT_KEY) != payload->value.MemberEnd()) {
ACSDK_WARN(LX("Format already exists in Json payload. Replace it with").d("format", formatString));
payload->value.FindMember(FORMAT_KEY)->value.SetString(formatString.c_str(), eventWithContext.GetAllocator());
} else {
payload->value.AddMember(
rapidjson::StringRef(FORMAT_KEY.c_str()), formatValue, eventWithContext.GetAllocator());
}
// emit metric
auto metricEvent = MetricEventBuilder{}
.setActivityName(AUDIO_ENCODING_FORMAT_ACTIVITY_NAME)
.addDataPoint(DataPointStringBuilder{}.setName("resolveKey").setValue(resolveKey).build());
switch (encodingFormat) {
case AudioFormat::Encoding::LPCM:
submitMetric(
metricRecorder,
metricEvent.addDataPoint(
DataPointCounterBuilder{}.setName(AUDIO_ENCODING_FORMAT_LPCM).increment(1).build()));
break;
case AudioFormat::Encoding::OPUS:
submitMetric(
metricRecorder,
metricEvent.addDataPoint(
DataPointCounterBuilder{}.setName(AUDIO_ENCODING_FORMAT_OPUS).increment(1).build()));
break;
default:
break;
}
rapidjson::StringBuffer buffer;
rapidjson::Writer<rapidjson::StringBuffer> writer(buffer);
eventWithContext.Accept(writer);
std::string jsonContent(buffer.GetString());
request->setJsonContent(jsonContent);
if (attachmentReaders.find(resolveKey) != attachmentReaders.end()) {
request->setAttachmentReaders(attachmentReaders.at(resolveKey));
}
return true;
}
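// A minimal sketch of how this resolver is wired into an unresolved MessageRequest (the lambda shape is an
// assumption; see getMessageRequestResolverLocked() and executeOnContextAvailable() for the actual wiring):
//
//     auto resolver = [encodingFormats, attachmentReaders, metricRecorder](
//                         const std::shared_ptr<EditableMessageRequest>& req, const std::string& resolveKey) {
//         return resolveMessageRequest(req, resolveKey, encodingFormats, attachmentReaders, metricRecorder);
//     };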
std::future<bool> AudioInputProcessor::expectSpeechTimedOut() {
return m_executor.submit([this]() { return executeExpectSpeechTimedOut(); });
}
void AudioInputProcessor::handleStopCaptureDirective(std::shared_ptr<DirectiveInfo> info) {
m_stopCaptureReceivedTime = steady_clock::now();
m_executor.submit([this, info]() {
bool stopImmediately = true;
executeStopCapture(stopImmediately, info);
});
}
void AudioInputProcessor::handleExpectSpeechDirective(std::shared_ptr<DirectiveInfo> info) {
int64_t timeout;
bool found = avsCommon::utils::json::jsonUtils::retrieveValue(
info->directive->getPayload(), "timeoutInMilliseconds", &timeout);
if (!found) {
static const char* errorMessage = "missing/invalid timeoutInMilliseconds";
m_exceptionEncounteredSender->sendExceptionEncountered(
info->directive->getUnparsedDirective(),
avsCommon::avs::ExceptionErrorType::UNSUPPORTED_OPERATION,
errorMessage);
if (info->result) {
info->result->setFailed(errorMessage);
}
ACSDK_ERROR(LX("handleExpectSpeechDirectiveFailed")
.d("reason", "missingJsonField")
.d("field", "timeoutInMilliseconds"));
removeDirective(info);
return;
}
m_executor.submit([this, timeout, info]() { executeExpectSpeech(milliseconds{timeout}, info); });
}
void AudioInputProcessor::handleSetEndOfSpeechOffsetDirective(std::shared_ptr<DirectiveInfo> info) {
// Record the duration from StopCapture to SetEndOfSpeechOffset. END_OF_SPEECH_OFFSET_RECEIVED is measured from
// Recognize, so it is not helpful for determining the best SHORT timeout.
submitMetric(
m_metricRecorder,
STOP_CAPTURE_TO_END_OF_SPEECH_ACTIVITY_NAME,
DataPointDurationBuilder{duration_cast<milliseconds>(steady_clock::now() - m_stopCaptureReceivedTime)}
.setName(STOP_CAPTURE_TO_END_OF_SPEECH_METRIC_NAME)
.build(),
info->directive);
auto payload = info->directive->getPayload();
int64_t endOfSpeechOffset = 0;
std::string startOfSpeechTimeStampInString;
bool foundEnd = json::jsonUtils::retrieveValue(payload, END_OF_SPEECH_OFFSET_FIELD_NAME, &endOfSpeechOffset);
bool foundStart =
json::jsonUtils::retrieveValue(payload, START_OF_SPEECH_TIMESTAMP_FIELD_NAME, &startOfSpeechTimeStampInString);
if (foundEnd && foundStart) {
int64_t startOfSpeechTimestamp = 0;
auto offset = milliseconds(endOfSpeechOffset);
submitMetric(
m_metricRecorder,
END_OF_SPEECH_OFFSET_RECEIVED_ACTIVITY_NAME,
DataPointDurationBuilder{offset}.setName(END_OF_SPEECH_OFFSET_RECEIVED).build(),
info->directive);
std::istringstream iss{startOfSpeechTimeStampInString};
iss >> startOfSpeechTimestamp;
ACSDK_DEBUG0(LX("handleSetEndOfSpeechOffsetDirective")
.d("startTimeSpeech(ms)", startOfSpeechTimestamp)
.d("endTimeSpeech(ms)", startOfSpeechTimestamp + endOfSpeechOffset));
if (info->result) {
    info->result->setCompleted();
}
} else {
std::string missing;
if (!foundEnd && !foundStart) {
missing = END_OF_SPEECH_OFFSET_FIELD_NAME + " and " + START_OF_SPEECH_TIMESTAMP_FIELD_NAME;
} else if (!foundEnd) {
missing = END_OF_SPEECH_OFFSET_FIELD_NAME;
} else {
missing = START_OF_SPEECH_TIMESTAMP_FIELD_NAME;
}
ACSDK_ERROR(LX("handleSetEndOfSpeechOffsetDirective").d("missing", missing));
if (info->result) {
    info->result->setFailed("Missing parameter(s): " + missing);
}
}
removeDirective(info);
}
bool AudioInputProcessor::executeRecognize(
AudioProvider provider,
Initiator initiator,
steady_clock::time_point startOfSpeechTimestamp,
avsCommon::avs::AudioInputStream::Index begin,
avsCommon::avs::AudioInputStream::Index end,
const std::string& keyword,
std::shared_ptr<const std::vector<char>> KWDMetadata,
const std::string& initiatorToken) {
// Make sure we have a keyword if this is a wakeword initiator.
if (Initiator::WAKEWORD == initiator && keyword.empty()) {
ACSDK_ERROR(LX("executeRecognizeFailed").d("reason", "emptyKeywordWithWakewordInitiator"));
return false;
}
// 500ms preroll.
avsCommon::avs::AudioInputStream::Index preroll = provider.format.sampleRateHz / 2;
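// For example: at a 16000 Hz sample rate, preroll = 16000 / 2 = 8000 samples, which is the fixed 500 ms
// PREROLL_DURATION expressed in samples.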
// Check if we have everything we need to enable false wakeword detection.
// TODO: Consider relaxing the hard requirement for a full 500ms preroll - ACSDK-276.
bool falseWakewordDetection =
Initiator::WAKEWORD == initiator && begin != INVALID_INDEX && begin >= preroll && end != INVALID_INDEX;
// Build the initiator json.
json::JsonGenerator generator;
std::string initiatorString = initiatorToString(initiator);
generator.addMember(TYPE_KEY, initiatorString);
generator.startObject(PAYLOAD_KEY);
// If we will be enabling false wakeword detection, add preroll and build the initiator payload.
if (falseWakewordDetection) {
generator.startObject(WAKEWORD_INDICES_KEY);
generator.addMember(START_INDEX_KEY, preroll);
generator.addMember(END_INDEX_KEY, preroll + (end - begin));
generator.finishObject();
begin -= preroll;
}
if (!keyword.empty()) {
generator.addMember(WAKE_WORD_KEY, string::stringToUpperCase(keyword));
}
if (!initiatorToken.empty()) {
generator.addMember(TOKEN_KEY, initiatorToken);
}
bool initiatedByWakeword = (Initiator::WAKEWORD == initiator);
return executeRecognize(
provider,
generator.toString(),
startOfSpeechTimestamp,
begin,
end,
keyword,
KWDMetadata,
initiatedByWakeword,
falseWakewordDetection,
initiatorString);
}
bool AudioInputProcessor::executeRecognize(
AudioProvider provider,
const std::string& initiatorJson,
steady_clock::time_point startOfSpeechTimestamp,
avsCommon::avs::AudioInputStream::Index begin,
avsCommon::avs::AudioInputStream::Index end,
const std::string& keyword,
std::shared_ptr<const std::vector<char>> KWDMetadata,
bool initiatedByWakeword,
bool falseWakewordDetection,
const std::string& initiatorString) {
if (!provider.stream) {
ACSDK_ERROR(LX("executeRecognizeFailed").d("reason", "nullAudioInputStream"));
return false;
}
std::unordered_map<int, std::string> mapSampleRatesAVSEncoding = {{32000, "OPUS"}};
std::string avsEncodingFormat;
std::unordered_map<int, std::string>::iterator itSampleRateAVSEncoding;
switch (provider.format.encoding) {
case avsCommon::utils::AudioFormat::Encoding::LPCM:
if (provider.format.sampleRateHz != 16000) {
ACSDK_ERROR(LX("executeRecognizeFailed")
.d("reason", "unsupportedSampleRateForPCM")
.d("sampleRate", provider.format.sampleRateHz));
return false;
} else if (provider.format.sampleSizeInBits != 16) {
ACSDK_ERROR(LX("executeRecognizeFailed")
.d("reason", "unsupportedSampleSize")
.d("sampleSize", provider.format.sampleSizeInBits));
return false;
}
avsEncodingFormat = encodingFormatToString(provider.format.encoding);
m_audioBytesForMetricThreshold = WAKEWORD_DETECTION_SEGMENT_SIZE_BYTES_PCM;
m_uploadMetricName = WAKEWORD_DETECTION_SEGMENT_UPLOADED_PCM;
break;
case avsCommon::utils::AudioFormat::Encoding::OPUS:
itSampleRateAVSEncoding = mapSampleRatesAVSEncoding.find(provider.format.sampleRateHz);
if (itSampleRateAVSEncoding == mapSampleRatesAVSEncoding.end()) {
ACSDK_ERROR(LX("executeRecognizeFailed")
.d("reason", "unsupportedSampleRateForOPUS")
.d("sampleRate", provider.format.sampleRateHz));
return false;
}
avsEncodingFormat = encodingFormatToString(provider.format.encoding);
m_audioBytesForMetricThreshold = WAKEWORD_DETECTION_SEGMENT_SIZE_BYTES_OPUS;
m_uploadMetricName = WAKEWORD_DETECTION_SEGMENT_UPLOADED_OPUS;
break;
default:
ACSDK_ERROR(LX("executeRecognizeFailed")
.d("reason", "unsupportedEncoding")
.d("encoding", provider.format.encoding));
return false;
}
if (provider.format.endianness != avsCommon::utils::AudioFormat::Endianness::LITTLE) {
ACSDK_ERROR(LX("executeRecognizeFailed")
.d("reason", "unsupportedEndianness")
.d("endianness", provider.format.endianness));
return false;
} else if (provider.format.numChannels != 1) {
ACSDK_ERROR(LX("executeRecognizeFailed")
.d("reason", "unsupportedNumChannels")
.d("channels", provider.format.numChannels));
return false;
}
// If this is a barge-in, verify that it is permitted.
switch (m_state) {
case ObserverInterface::State::IDLE:
case ObserverInterface::State::EXPECTING_SPEECH:
break;
case ObserverInterface::State::RECOGNIZING:
// Barge-in is only permitted if the audio providers have compatible policies.
if (!m_lastAudioProvider.canBeOverridden) {
ACSDK_ERROR(LX("executeRecognizeFailed").d("reason", "Active audio provider can not be overridden"));
return false;
}
if (!provider.canOverride) {
ACSDK_ERROR(LX("executeRecognizeFailed").d("reason", "New audio provider can not override"));
return false;
}
// For barge-in, we should close the previous reader before creating another one.
if (m_usingEncoder) {
m_encoder->stopEncoding(true);
}
closeAttachmentReaders();
break;
case ObserverInterface::State::BUSY:
ACSDK_ERROR(LX("executeRecognizeFailed").d("reason", "Barge-in is not permitted while busy"));
return false;
}
if (settings::WakeWordConfirmationSettingType::TONE == m_wakeWordConfirmation->get()) {
m_executor.submit(
[this]() { m_systemSoundPlayer->playTone(SystemSoundPlayerInterface::Tone::WAKEWORD_NOTIFICATION); });
}
// Mutex required since the following operations depend on the value of @c m_encodingAudioFormats.
std::lock_guard<std::mutex> lock(m_encodingFormatMutex);
// Assemble the event payload.
json::JsonGenerator payloadGenerator;
payloadGenerator.addMember(PROFILE_KEY, asrProfileToString(provider.profile));
m_usingEncoder = isUsingEncoderLocked();
// The initiator (or lack thereof) from a previous ExpectSpeech has precedence.
if (m_precedingExpectSpeechInitiator) {
if (!m_precedingExpectSpeechInitiator->empty()) {
payloadGenerator.addRawJsonMember(INITIATOR_KEY, *m_precedingExpectSpeechInitiator);
}
m_precedingExpectSpeechInitiator.reset();
} else if (!initiatorJson.empty()) {
payloadGenerator.addRawJsonMember(INITIATOR_KEY, initiatorJson);
}
payloadGenerator.addMember(
START_OF_SPEECH_TIMESTAMP_FIELD_NAME, std::to_string(startOfSpeechTimestamp.time_since_epoch().count()));
// Set up an attachment reader for the event.
avsCommon::avs::attachment::InProcessAttachmentReader::SDSTypeIndex offset = 0;
auto reference = AudioInputStream::Reader::Reference::BEFORE_WRITER;
if (INVALID_INDEX != begin) {
offset = begin;
reference = AudioInputStream::Reader::Reference::ABSOLUTE;
}
avsCommon::avs::AudioInputStream::Index encodingOffset = 0;
AudioInputStream::Reader::Reference encodingReference = AudioInputStream::Reader::Reference::ABSOLUTE;
// Set up the speech encoder
if (m_usingEncoder) {
ACSDK_DEBUG(LX("encodingAudio").d("format", avsEncodingFormat));
if (!m_encoder->startEncoding(provider.stream, provider.format, offset, reference)) {
ACSDK_ERROR(LX("executeRecognizeFailed").d("reason", "Failed to start encoder"));
return false;
}
} else {
ACSDK_DEBUG(LX("notEncodingAudio"));
}
if (KWDMetadata) {
for (auto& it : m_encodingAudioFormats) {
std::shared_ptr<attachment::AttachmentReader> metadataReader =
attachment::AttachmentUtils::createAttachmentReader(*KWDMetadata);
if (!metadataReader) {
ACSDK_ERROR(LX("sendingKWDMetadataFailed").d("reason", "Failed to create attachment reader"));
// If any metadata reader fails to be created, clear all existing metadata readers for consistency.
closeAttachmentReaders(attachment::AttachmentReader::ClosePoint::IMMEDIATELY);
break;
}
auto resolveKey = it.first;
m_attachmentReaders[resolveKey] = {
std::make_shared<MessageRequest::NamedReader>(KWD_METADATA_FIELD_NAME, metadataReader)};
}
}
// Create one audio stream reader per configured encoding format.
for (auto& it : m_encodingAudioFormats) {
ACSDK_DEBUG(
LX("Create audio reader").d("format", it.second).d("offset", offset).d("encodingOffset", encodingOffset));
const auto audioFormat = it.second;
auto isLPCMEncodingAudioFormat = (AudioFormat::Encoding::LPCM == audioFormat);
std::shared_ptr<attachment::AttachmentReader> audioReader =
attachment::DefaultAttachmentReader<AudioInputStream>::create(
sds::ReaderPolicy::NONBLOCKING,
isLPCMEncodingAudioFormat ? provider.stream : m_encoder->getEncodedStream(),
isLPCMEncodingAudioFormat ? offset : encodingOffset,
isLPCMEncodingAudioFormat ? reference : encodingReference);
if (!audioReader) {
ACSDK_ERROR(LX("executeRecognizeFailed").d("reason", "Failed to create attachment reader"));
closeAttachmentReaders();
return false;
}
auto resolveKey = it.first;
m_attachmentReaders[resolveKey].push_back(
std::make_shared<MessageRequest::NamedReader>(AUDIO_ATTACHMENT_FIELD_NAME, audioReader));
ACSDK_INFO(LX("Create audio attachment reader success")
.d("resolveKey", resolveKey)
.d("format", encodingFormatToString(it.second)));
}
if (!multiStreamsRequestedLocked()) {
m_messageRequestResolver = nullptr;
// Set up format for single audio stream request
if (m_usingEncoder && m_encoder->getContext()) {
avsEncodingFormat = m_encoder->getContext()->getAVSFormatName();
}
payloadGenerator.addMember(FORMAT_KEY, avsEncodingFormat);
// emit audio encoding format metric
if (!m_encodingAudioFormats.empty()) {
auto metricEvent = MetricEventBuilder{}.setActivityName(AUDIO_ENCODING_FORMAT_ACTIVITY_NAME);
auto encodingAudioFormat = m_encodingAudioFormats.begin()->second;
switch (encodingAudioFormat) {
case avsCommon::utils::AudioFormat::Encoding::OPUS:
submitMetric(
m_metricRecorder,
metricEvent.addDataPoint(
DataPointCounterBuilder{}.setName(AUDIO_ENCODING_FORMAT_OPUS).increment(1).build()));
m_audioBytesForMetricThreshold = WAKEWORD_DETECTION_SEGMENT_SIZE_BYTES_OPUS;
m_uploadMetricName = WAKEWORD_DETECTION_SEGMENT_UPLOADED_OPUS;
break;
case avsCommon::utils::AudioFormat::Encoding::LPCM:
submitMetric(
m_metricRecorder,
metricEvent.addDataPoint(
DataPointCounterBuilder{}.setName(AUDIO_ENCODING_FORMAT_LPCM).increment(1).build()));
m_audioBytesForMetricThreshold = WAKEWORD_DETECTION_SEGMENT_SIZE_BYTES_PCM;
m_uploadMetricName = WAKEWORD_DETECTION_SEGMENT_UPLOADED_PCM;
break;
default:
ACSDK_WARN(LX("executeRecognize")
.m("failed to emit audio encoding format metric")
.d("reason", "unsupportedEncoding")
.d("encoding", encodingFormatToString(encodingAudioFormat)));
break;
}
}
} else {
m_messageRequestResolver = getMessageRequestResolverLocked();
}
// Code below this point changes the state of AIP. Formally update state now, and don't error out without calling
// executeResetState() after this point.
m_preCachedDialogRequestId = uuidGeneration::generateUUID();
setState(ObserverInterface::State::RECOGNIZING);
// Note that we're preparing to send a Recognize event.
m_preparingToSend = true;
// Reset flag when we send a new recognize event.
m_localStopCapturePerformed = false;
m_streamIsClosedInRecognizingState = false;
// Start assembling the context; we'll service the callback after assembling our Recognize event.
m_contextManager->getContextWithoutReportableStateProperties(shared_from_this());
// Stop the ExpectSpeech timer so we don't get a timeout.
m_expectingSpeechTimer.stop();
// Record provider as the last-used AudioProvider so it can be used in the event of an ExpectSpeech directive.
m_lastAudioProvider = provider;
// Record the Recognize event payload for later use by executeContextAvailable().
m_recognizePayload = payloadGenerator.toString();
// We can't assemble the MessageRequest until we receive the context.
m_recognizeRequest.reset();
// Handle metrics for this event.
submitMetric(
m_metricRecorder,
MetricEventBuilder{}
.setActivityName(START_OF_UTTERANCE_ACTIVITY_NAME)
.addDataPoint(DataPointStringBuilder{}
.setName("TIME_SINCE_RESUME_ID")
.setValue(std::to_string(m_timeSinceLastResumeMS.count()))
.build())
.addDataPoint(DataPointStringBuilder{}
.setName("TIME_SINCE_PARTIAL_ID")
.setValue(std::to_string(m_timeSinceLastPartialMS.count()))
.build())
.addDataPoint(DataPointStringBuilder{}
.setName("RESOURCE_TYPE_ID")
.setValue(std::to_string(m_resourceFlags.to_ulong()))
.build())
.addDataPoint(DataPointCounterBuilder{}.setName(START_OF_UTTERANCE).increment(1).build()),
m_preCachedDialogRequestId);
// Submit initiator metric if available.
if (!initiatorString.empty()) {
submitMetric(
m_metricRecorder,
MetricEventBuilder{}
.setActivityName(INITIATOR_ACTIVITY_NAME_PREFIX + initiatorString)
.addDataPoint(
DataPointCounterBuilder{}.setName(INITIATOR_PREFIX + initiatorString).increment(1).build()),
m_preCachedDialogRequestId);
}
if (initiatedByWakeword) {
auto duration = milliseconds((end - begin) * MILLISECONDS_PER_SECOND / provider.format.sampleRateHz);
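// Worked example: a wake word spanning end - begin = 8000 samples at 16000 Hz yields
// 8000 * 1000 / 16000 = 500 ms.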
auto startOfStreamTimestamp = startOfSpeechTimestamp;
if (falseWakewordDetection) {
startOfStreamTimestamp -= PREROLL_DURATION;
}
submitMetric(
m_metricRecorder,
MetricEventBuilder{}
.setActivityName(WW_DURATION_ACTIVITY_NAME)
.addDataPoint(DataPointDurationBuilder{duration}.setName(WW_DURATION).build())
.addDataPoint(
DataPointDurationBuilder{duration_cast<milliseconds>(startOfStreamTimestamp.time_since_epoch())}
.setName(START_OF_STREAM_TIMESTAMP)
.build())
.addDataPoint(DataPointCounterBuilder{}.setName(RECOGNIZE_START_SEND_MESSAGE).increment(1).build()),
m_preCachedDialogRequestId);
ACSDK_DEBUG(LX(__func__).d("WW_DURATION(ms)", duration.count()));
}
submitInstanceEntryMetric(
m_metricRecorder,
m_preCachedDialogRequestId,
START_OF_UTTERANCE,
std::map<std::string, std::string>{{"initiator", !initiatorString.empty() ? initiatorString : "unknown"}});
return true;
}
void AudioInputProcessor::executeOnContextAvailable(const std::string& jsonContext) {
ACSDK_DEBUG(LX("executeOnContextAvailable").sensitive("jsonContext", jsonContext));
// Should already be RECOGNIZING if we get here.
if (m_state != ObserverInterface::State::RECOGNIZING) {
ACSDK_ERROR(
LX("executeOnContextAvailableFailed").d("reason", "Not permitted in current state").d("state", m_state));
return;
}
// Should already have a reader.
if (m_attachmentReaders.empty()) {
ACSDK_ERROR(LX("executeOnContextAvailableFailed").d("reason", "nullReader"));
executeResetState();
return;
}
// Recognize payload should not be empty.
if (m_recognizePayload.empty()) {
ACSDK_ERROR(LX("executeOnContextAvailableFailed").d("reason", "payloadEmpty"));
executeResetState();
return;
}
// Start acquiring the channel right away; we'll service the callback after assembling our Recognize event.
if (m_focusState != avsCommon::avs::FocusState::FOREGROUND) {
auto activity = FocusManagerInterface::Activity::create(
NAMESPACE, shared_from_this(), std::chrono::milliseconds::zero(), avsCommon::avs::ContentType::MIXABLE);
if (!m_focusManager->acquireChannel(CHANNEL_NAME, activity)) {
ACSDK_ERROR(LX("executeOnContextAvailableFailed").d("reason", "Unable to acquire channel"));
executeResetState();
return;
}
}
if (!m_preCachedDialogRequestId.empty()) {
m_directiveSequencer->setDialogRequestId(m_preCachedDialogRequestId);
m_preCachedDialogRequestId.clear();
}
// Assemble the MessageRequest. It will be sent by executeOnFocusChanged when we acquire the channel.
auto msgIdAndJsonEvent =
buildJsonEventString("Recognize", m_directiveSequencer->getDialogRequestId(), m_recognizePayload, jsonContext);
if (m_messageRequestResolver) {
// Create unresolved MessageRequest
m_recognizeRequest = std::make_shared<MessageRequest>(
msgIdAndJsonEvent.second,
true,
"",
std::vector<std::pair<std::string, std::string>>{},
m_messageRequestResolver,
m_audioBytesForMetricThreshold,
m_uploadMetricName);
} else {
m_recognizeRequest = std::make_shared<MessageRequest>(
msgIdAndJsonEvent.second, m_audioBytesForMetricThreshold, m_uploadMetricName);
// Only add attachment readers to a resolved MessageRequest. For an unresolved request, the attachment
// readers are passed to the resolver function instead.
if (!m_attachmentReaders.empty() && !m_attachmentReaders.begin()->second.empty()) {
for (auto& namedReader : m_attachmentReaders.begin()->second) {
m_recognizeRequest->addAttachmentReader(namedReader->name, namedReader->reader);
}
}
}
// If we already have focus, there won't be a callback to send the message, so send it now.
if (avsCommon::avs::FocusState::FOREGROUND == m_focusState) {
sendRequestNow();
}
}
void AudioInputProcessor::executeOnContextFailure(const ContextRequestError error) {
ACSDK_ERROR(LX("executeOnContextFailure").d("error", error));
executeResetState();
}
void AudioInputProcessor::executeOnFocusChanged(avsCommon::avs::FocusState newFocus) {
ACSDK_DEBUG(LX("executeOnFocusChanged").d("newFocus", newFocus));
// Note new focus state.
m_focusState = newFocus;
// If we're losing focus, stop using the channel.
if (newFocus != avsCommon::avs::FocusState::FOREGROUND) {
ACSDK_DEBUG(LX("executeOnFocusChanged").d("reason", "Lost focus"));
executeResetState();
return;
}
// We're not losing the channel (m_focusState == avsCommon::avs::FocusState::FOREGROUND). For all
// states except RECOGNIZING, there's nothing more to do here.
if (m_state != ObserverInterface::State::RECOGNIZING) {
return;
}
// For a focus change to FOREGROUND in the Recognizing state, we may have a message queued up to send. If we do,
// we can safely send it now.
sendRequestNow();
}
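// Handles StopCapture, either from an AVS directive (info != nullptr) or initiated locally (info == nullptr).
// The actual stop is wrapped in a lambda so it can be deferred until the Recognize event has started sending.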
bool AudioInputProcessor::executeStopCapture(bool stopImmediately, std::shared_ptr<DirectiveInfo> info) {
if (info && info->isCancelled) {
ACSDK_DEBUG(LX("stopCaptureIgnored").d("reason", "isCancelled"));
return true;
}
if (m_state != ObserverInterface::State::RECOGNIZING) {
auto returnValue = false;
if (info) {
if (info->result) {
if (m_localStopCapturePerformed || m_streamIsClosedInRecognizingState) {
// Since a local StopCapture was performed or the event stream is closed while in RECOGNIZING state,
// we can safely ignore the StopCapture from AVS.
ACSDK_INFO(LX("executeStopCapture")
.d("localStopCapturePerformed", m_localStopCapturePerformed)
.d("streamIsClosedInRecognizingState", m_streamIsClosedInRecognizingState)
.m("StopCapture directive ignored because local StopCapture was performed or event"
" stream is closed while in recognizing state."));
m_localStopCapturePerformed = false;
m_streamIsClosedInRecognizingState = false;
returnValue = true;
info->result->setCompleted();
} else {
static const char* errorMessage = "StopCapture only allowed in RECOGNIZING state.";
info->result->setFailed(errorMessage);
ACSDK_ERROR(LX("executeStopCaptureFailed")
.d("reason", "invalidState")
.d("expectedState", "RECOGNIZING")
.d("state", m_state));
}
}
removeDirective(info);
}
return returnValue;
}
    // If info is nullptr, this indicates that a local StopCapture was performed.
if (nullptr == info) {
m_localStopCapturePerformed = true;
}
// Create a lambda to do the StopCapture.
std::function<void()> stopCapture = [=] {
ACSDK_DEBUG(LX("stopCapture").d("stopImmediately", stopImmediately));
if (m_usingEncoder) {
            // If the SpeechEncoder is in use, let it finish encoding so the stream will be closed automatically.
m_encoder->stopEncoding(stopImmediately);
m_usingEncoder = false;
}
auto closePoint = stopImmediately ? attachment::AttachmentReader::ClosePoint::IMMEDIATELY
: attachment::AttachmentReader::ClosePoint::AFTER_DRAINING_CURRENT_BUFFER;
closeAttachmentReaders(closePoint);
setState(ObserverInterface::State::BUSY);
if (info) {
if (info->result) {
info->result->setCompleted();
}
removeDirective(info);
}
if (m_speechConfirmation->get() == settings::SpeechConfirmationSettingType::TONE) {
m_systemSoundPlayer->playTone(SystemSoundPlayerInterface::Tone::END_SPEECH);
}
};
if (m_preparingToSend) {
// If we're still preparing to send, we'll save the lambda and call it after we start sending.
m_deferredStopCapture = stopCapture;
} else {
// Otherwise, we can call the lambda now.
stopCapture();
}
return true;
}
void AudioInputProcessor::executeResetState() {
// Irrespective of current state, clean up and go back to idle.
ACSDK_DEBUG(LX(__func__));
m_expectingSpeechTimer.stop();
m_precedingExpectSpeechInitiator.reset();
closeAttachmentReaders();
if (m_encoder) {
m_encoder->stopEncoding(true);
}
if (m_recognizeRequestSent) {
m_recognizeRequestSent->removeObserver(shared_from_this());
m_recognizeRequestSent.reset();
}
m_recognizeRequest.reset();
m_preparingToSend = false;
m_deferredStopCapture = nullptr;
if (m_focusState != avsCommon::avs::FocusState::NONE) {
m_focusManager->releaseChannel(CHANNEL_NAME, shared_from_this());
}
m_focusState = avsCommon::avs::FocusState::NONE;
setState(ObserverInterface::State::IDLE);
m_audioBytesForMetricThreshold = 0;
}
bool AudioInputProcessor::executeExpectSpeech(milliseconds timeout, std::shared_ptr<DirectiveInfo> info) {
if (info && info->isCancelled) {
ACSDK_DEBUG(LX("expectSpeechIgnored").d("reason", "isCancelled"));
return true;
}
    // Defensive: everything below dereferences info, so bail out early if it or its directive is null.
    if (!info || !info->directive) {
        ACSDK_ERROR(LX("executeExpectSpeechFailed").d("reason", "nullDirectiveInfo"));
        return false;
    }
    if (m_state != ObserverInterface::State::IDLE && m_state != ObserverInterface::State::BUSY) {
        if (info->result) {
            static const char* errorMessage = "ExpectSpeech only allowed in IDLE or BUSY state.";
            info->result->setFailed(errorMessage);
        }
removeDirective(info);
ACSDK_ERROR(LX("executeExpectSpeechFailed")
.d("reason", "invalidState")
.d("expectedState", "IDLE/BUSY")
.d("state", m_state));
return false;
}
if (m_precedingExpectSpeechInitiator) {
ACSDK_ERROR(LX(__func__)
.d("reason", "precedingExpectSpeechInitiatorUnconsumed")
.d(INITIATOR_KEY.c_str(), *m_precedingExpectSpeechInitiator));
}
m_precedingExpectSpeechInitiator = memory::make_unique<std::string>("");
bool found = json::jsonUtils::retrieveValue(
info->directive->getPayload(), INITIATOR_KEY, m_precedingExpectSpeechInitiator.get());
if (found) {
ACSDK_DEBUG(LX(__func__).d("initiatorFound", *m_precedingExpectSpeechInitiator));
} else {
*m_precedingExpectSpeechInitiator = "";
}
setState(ObserverInterface::State::EXPECTING_SPEECH);
if (info->result) {
info->result->setCompleted();
}
removeDirective(info);
if (m_expectSpeechTimeoutHandler &&
m_expectSpeechTimeoutHandler->handleExpectSpeechTimeout(timeout, [this]() { return expectSpeechTimedOut(); })) {
ACSDK_DEBUG(LX(__func__).m("Expect speech timeout being handled externally"));
        // The ExpectSpeech timeout will be handled externally.
} else {
ACSDK_DEBUG(LX(__func__).m("Expect speech timeout being handled internally"));
// Start the ExpectSpeech timer.
if (!m_expectingSpeechTimer.start(timeout, std::bind(&AudioInputProcessor::expectSpeechTimedOut, this))
.valid()) {
ACSDK_ERROR(LX("executeExpectSpeechFailed").d("reason", "startTimerFailed"));
}
}
// If possible, start recognizing immediately.
if (m_lastAudioProvider && m_lastAudioProvider.alwaysReadable) {
return executeRecognize(m_lastAudioProvider, "");
} else if (m_defaultAudioProvider && m_defaultAudioProvider.alwaysReadable) {
return executeRecognize(m_defaultAudioProvider, "");
}
return true;
}
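// Invoked when no speech follows an ExpectSpeech directive within the timeout: report ExpectSpeechTimedOut to
// AVS and return to IDLE.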
bool AudioInputProcessor::executeExpectSpeechTimedOut() {
if (m_state != ObserverInterface::State::EXPECTING_SPEECH) {
ACSDK_ERROR(LX("executeExpectSpeechTimedOutFailure")
.d("reason", "invalidState")
.d("expectedState", "EXPECTING_SPEECH")
.d("state", m_state));
return false;
}
m_precedingExpectSpeechInitiator.reset();
auto msgIdAndJsonEvent = buildJsonEventString("ExpectSpeechTimedOut");
auto request = std::make_shared<MessageRequest>(msgIdAndJsonEvent.second);
request->addObserver(shared_from_this());
m_messageSender->sendMessage(request);
setState(ObserverInterface::State::IDLE);
ACSDK_ERROR(LX("executeExpectSpeechFailed").d("reason", "Timed Out"));
return true;
}
void AudioInputProcessor::executeOnDialogUXStateChanged(DialogUXStateObserverInterface::DialogUXState newState) {
ACSDK_DEBUG0(LX(__func__).d("newState", newState));
if (!m_initialDialogUXStateReceived) {
// The initial dialog UX state change call comes from simply registering as an observer; it is not a deliberate
// change to the dialog state which should interrupt a recognize event.
m_initialDialogUXStateReceived = true;
return;
}
if (newState != DialogUXStateObserverInterface::DialogUXState::IDLE) {
return;
}
executeResetState();
}
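// Performs the actual state transition, records a state-change metric, and notifies observers. No lock guards
// m_state here; the apparent assumption throughout this file is that state changes are serialized on m_executor.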
void AudioInputProcessor::setState(ObserverInterface::State state) {
if (m_state == state) {
return;
}
    // Reset the user inactivity timer if transitioning to or from the RECOGNIZING state.
if (ObserverInterface::State::RECOGNIZING == m_state || ObserverInterface::State::RECOGNIZING == state) {
m_executor.submit([this]() { m_userInactivityMonitor->onUserActive(); });
}
auto currentDialogRequestId =
m_preCachedDialogRequestId.empty() ? m_directiveSequencer->getDialogRequestId() : m_preCachedDialogRequestId;
ACSDK_DEBUG5(LX(__func__).d("currentDialogRequestId", currentDialogRequestId));
if (!currentDialogRequestId.empty()) {
submitInstanceEntryMetric(
m_metricRecorder,
currentDialogRequestId,
ENTRY_METRIC_NAME_STATE_CHANGE,
std::map<std::string, std::string>{{"from", ObserverInterface::stateToString(m_state)},
{"to", ObserverInterface::stateToString(state)}});
}
ACSDK_DEBUG(LX("setState").d("from", m_state).d("to", state));
m_state = state;
managePowerResource(m_state);
for (auto observer : m_observers) {
observer->onStateChanged(m_state);
}
}
void AudioInputProcessor::removeDirective(std::shared_ptr<DirectiveInfo> info) {
    // Check the result too, to catch cases where the DirectiveInfo was created locally with a nullptr result.
    // In those cases there is no messageId to remove because no result was expected.
if (info->directive && info->result) {
CapabilityAgent::removeDirective(info->directive->getMessageId());
}
}
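// Sends the pending Recognize MessageRequest, if one has been assembled. The request is retained in
// m_recognizeRequestSent so later status/exception callbacks can be correlated with it, and any StopCapture that
// was deferred while the request was being prepared is run now.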
void AudioInputProcessor::sendRequestNow() {
ACSDK_DEBUG(LX(__func__));
if (m_recognizeRequest) {
ACSDK_METRIC_IDS(TAG, "Recognize", "", "", Metrics::Location::AIP_SEND);
if (m_recognizeRequestSent && (m_recognizeRequestSent != m_recognizeRequest)) {
m_recognizeRequestSent->removeObserver(shared_from_this());
}
m_recognizeRequestSent = m_recognizeRequest;
m_recognizeRequestSent->addObserver(shared_from_this());
m_messageSender->sendMessage(m_recognizeRequestSent);
m_recognizeRequest.reset();
m_preparingToSend = false;
if (m_deferredStopCapture) {
m_deferredStopCapture();
m_deferredStopCapture = nullptr;
}
}
}
void AudioInputProcessor::onResponseStatusReceived(
avsCommon::sdkInterfaces::MessageRequestObserverInterface::Status status) {
ACSDK_DEBUG(LX("onResponseStatusReceived").d("status", status));
if (status != MessageRequestObserverInterface::Status::SUCCESS) {
resetState();
}
}
void AudioInputProcessor::onExceptionReceived(const std::string& exceptionMessage) {
ACSDK_ERROR(LX("onExceptionReceived").d("exception", exceptionMessage));
resetState();
}
void AudioInputProcessor::onSendCompleted(MessageRequestObserverInterface::Status status) {
ACSDK_DEBUG(LX("onSendCompleted").d("status", status));
m_executor.submit([this, status]() {
if (MessageRequestObserverInterface::Status::SUCCESS == status &&
ObserverInterface::State::RECOGNIZING == m_state) {
// This is to take care of the edge case where the event stream is closed before a stop capture is received.
m_streamIsClosedInRecognizingState = true;
}
executeResetState();
});
}
std::unordered_set<std::shared_ptr<avsCommon::avs::CapabilityConfiguration>> AudioInputProcessor::
getCapabilityConfigurations() {
std::lock_guard<std::mutex> lock(m_mutex);
return m_capabilityConfigurations;
}
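// When locale assets change, rebuild the SpeechRecognizer capability configuration (supported wake words can
// differ per locale) and notify capability-change observers outside the lock.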
void AudioInputProcessor::onLocaleAssetsChanged() {
m_executor.submit([this]() { executeOnLocaleAssetsChanged(); });
}
void AudioInputProcessor::executeOnLocaleAssetsChanged() {
ACSDK_DEBUG(LX(__func__));
std::unique_lock<std::mutex> lock(m_mutex);
m_capabilityConfigurations.clear();
auto newWakeWordsConfiguration = getSpeechRecognizerCapabilityConfiguration(*m_assetsManager);
m_capabilityConfigurations.insert(newWakeWordsConfiguration);
lock.unlock();
m_capabilityChangeNotifier->notifyObservers(
[newWakeWordsConfiguration](
std::shared_ptr<avsCommon::sdkInterfaces::CapabilityConfigurationChangeObserverInterface> observer) {
observer->onConfigurationChanged(*newWakeWordsConfiguration);
});
}
void AudioInputProcessor::handleDirectiveFailure(
const std::string& errorMessage,
std::shared_ptr<DirectiveInfo> info,
avsCommon::avs::ExceptionErrorType errorType) {
m_exceptionEncounteredSender->sendExceptionEncountered(
info->directive->getUnparsedDirective(), errorType, errorMessage);
if (info->result) {
info->result->setFailed(errorMessage);
}
removeDirective(info);
ACSDK_ERROR(LX("handleDirectiveFailed").d("error", errorMessage));
}
settings::SettingEventMetadata AudioInputProcessor::getWakeWordConfirmationMetadata() {
return settings::SettingEventMetadata{NAMESPACE,
WAKE_WORD_CONFIRMATION_CHANGED_EVENT_NAME,
WAKE_WORD_CONFIRMATION_REPORT_EVENT_NAME,
WAKE_WORD_CONFIRMATION_PAYLOAD_KEY};
}
settings::SettingEventMetadata AudioInputProcessor::getSpeechConfirmationMetadata() {
return settings::SettingEventMetadata{NAMESPACE,
SPEECH_CONFIRMATION_CHANGED_EVENT_NAME,
SPEECH_CONFIRMATION_REPORT_EVENT_NAME,
SPEECH_CONFIRMATION_PAYLOAD_KEY};
}
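// Handles the SetWakeWordConfirmation directive. A hypothetical payload, assuming the key defined by
// WAKE_WORD_CONFIRMATION_PAYLOAD_KEY is "wakeWordConfirmation" (illustrative only; the constant is defined
// earlier in this file):
//   {"wakeWordConfirmation":"TONE"}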
bool AudioInputProcessor::handleSetWakeWordConfirmation(std::shared_ptr<DirectiveInfo> info) {
std::string jsonValue;
bool found = avsCommon::utils::json::jsonUtils::retrieveValue(
info->directive->getPayload(), WAKE_WORD_CONFIRMATION_PAYLOAD_KEY, &jsonValue);
if (!found) {
std::string errorMessage = "missing " + WAKE_WORD_CONFIRMATION_PAYLOAD_KEY;
sendExceptionEncounteredAndReportFailed(
info, errorMessage, avsCommon::avs::ExceptionErrorType::UNEXPECTED_INFORMATION_RECEIVED);
return false;
}
settings::WakeWordConfirmationSettingType value = settings::getWakeWordConfirmationDefault();
std::stringstream ss{jsonValue};
ss >> value;
if (ss.fail()) {
std::string errorMessage = "invalid " + WAKE_WORD_CONFIRMATION_PAYLOAD_KEY;
sendExceptionEncounteredAndReportFailed(
info, errorMessage, avsCommon::avs::ExceptionErrorType::UNEXPECTED_INFORMATION_RECEIVED);
return false;
}
auto executeChange = [this, value]() { m_wakeWordConfirmation->setAvsChange(value); };
m_executor.submit(executeChange);
if (info->result) {
info->result->setCompleted();
}
removeDirective(info);
return true;
}
bool AudioInputProcessor::handleSetSpeechConfirmation(std::shared_ptr<DirectiveInfo> info) {
std::string jsonValue;
bool found = avsCommon::utils::json::jsonUtils::retrieveValue(
info->directive->getPayload(), SPEECH_CONFIRMATION_PAYLOAD_KEY, &jsonValue);
if (!found) {
std::string errorMessage = "missing/invalid " + SPEECH_CONFIRMATION_PAYLOAD_KEY;
sendExceptionEncounteredAndReportFailed(
info, errorMessage, avsCommon::avs::ExceptionErrorType::UNEXPECTED_INFORMATION_RECEIVED);
return false;
}
settings::SpeechConfirmationSettingType value;
std::stringstream ss{jsonValue};
ss >> value;
if (ss.fail()) {
std::string errorMessage = "invalid " + SPEECH_CONFIRMATION_PAYLOAD_KEY;
sendExceptionEncounteredAndReportFailed(
info, errorMessage, avsCommon::avs::ExceptionErrorType::UNEXPECTED_INFORMATION_RECEIVED);
return false;
}
auto executeChange = [this, value]() { m_speechConfirmation->setAvsChange(value); };
m_executor.submit(executeChange);
if (info->result) {
info->result->setCompleted();
}
removeDirective(info);
return true;
}
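// Handles the SetWakeWords directive. A hypothetical payload, assuming the key defined by WAKE_WORDS_PAYLOAD_KEY
// is "wakeWords" (illustrative only; the constant is defined earlier in this file):
//   {"wakeWords":["ALEXA"]}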
bool AudioInputProcessor::handleSetWakeWords(std::shared_ptr<DirectiveInfo> info) {
auto wakeWords = json::jsonUtils::retrieveStringArray<settings::WakeWords>(
info->directive->getPayload(), WAKE_WORDS_PAYLOAD_KEY);
if (wakeWords.empty()) {
std::string errorMessage = "missing/invalid " + WAKE_WORDS_PAYLOAD_KEY;
sendExceptionEncounteredAndReportFailed(
info, errorMessage, avsCommon::avs::ExceptionErrorType::UNEXPECTED_INFORMATION_RECEIVED);
return false;
}
if (!m_wakeWordsSetting) {
std::string errorMessage = "Wake words are not supported in this device";
sendExceptionEncounteredAndReportFailed(
info, errorMessage, avsCommon::avs::ExceptionErrorType::UNSUPPORTED_OPERATION);
return false;
}
m_executor.submit([this, wakeWords, info]() { m_wakeWordsSetting->setAvsChange(wakeWords); });
if (info->result) {
info->result->setCompleted();
}
removeDirective(info);
return true;
}
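// Holds the power resource while audio is being captured (RECOGNIZING) or is about to be (EXPECTING_SPEECH), and
// releases it when the interaction ends (BUSY/IDLE). The duration of the acquire/release call itself is reported
// as a metric.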
void AudioInputProcessor::managePowerResource(ObserverInterface::State newState) {
if (!m_powerResourceId) {
return;
}
auto startTime = steady_clock::now();
ACSDK_DEBUG5(LX(__func__).d("state", newState));
switch (newState) {
case ObserverInterface::State::RECOGNIZING:
case ObserverInterface::State::EXPECTING_SPEECH:
m_powerResourceManager->acquire(m_powerResourceId);
m_timeSinceLastResumeMS = m_powerResourceManager->getTimeSinceLastResumeMS();
m_timeSinceLastPartialMS =
m_powerResourceManager->getTimeSinceLastPartialMS(POWER_RESOURCE_COMPONENT_NAME, m_resourceFlags);
submitMetric(
m_metricRecorder,
MetricEventBuilder{}
.setActivityName(ACQUIRE_POWER_RESOURCE_ACTIVITY)
.addDataPoint(DataPointDurationBuilder{duration_cast<milliseconds>(steady_clock::now() - startTime)}
.setName(ACQUIRE_POWER_RESOURCE)
.build()),
m_preCachedDialogRequestId);
break;
case ObserverInterface::State::BUSY:
case ObserverInterface::State::IDLE:
m_powerResourceManager->release(m_powerResourceId);
submitMetric(
m_metricRecorder,
MetricEventBuilder{}
.setActivityName(RELEASE_POWER_RESOURCE_ACTIVITY)
.addDataPoint(DataPointDurationBuilder{duration_cast<milliseconds>(steady_clock::now() - startTime)}
.setName(RELEASE_POWER_RESOURCE)
.build()),
m_preCachedDialogRequestId);
break;
}
}
void AudioInputProcessor::onConnectionStatusChanged(bool connected) {
if (!connected) {
m_executor.submit([this]() { return executeDisconnected(); });
}
}
void AudioInputProcessor::executeDisconnected() {
if (ObserverInterface::State::IDLE != m_state) {
ACSDK_WARN(LX(__func__).d("state", AudioInputProcessorObserverInterface::stateToString(m_state)));
executeResetState();
}
}
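// Accepts the encoding only if it is LPCM or matches what the configured encoder produces. On success the format
// map collapses to a single entry keyed by DEFAULT_RESOLVE_KEY, so AIP sends an already-resolved MessageRequest.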
bool AudioInputProcessor::setEncodingAudioFormat(AudioFormat::Encoding encoding) {
if (encoding == AudioFormat::Encoding::LPCM ||
(m_encoder && m_encoder->getContext() && encoding == m_encoder->getContext()->getAudioFormat().encoding)) {
std::lock_guard<std::mutex> lock(m_encodingFormatMutex);
m_encodingAudioFormats.clear();
        // Only one format is configured, so AIP will send a resolved MessageRequest; this resolveKey is simply a
        // placeholder.
m_encodingAudioFormats.emplace(DEFAULT_RESOLVE_KEY, encoding);
return true;
}
return false;
}
bool AudioInputProcessor::initialize() {
if (m_encoder && m_encoder->getContext()) {
std::lock_guard<std::mutex> lock(m_encodingFormatMutex);
m_encodingAudioFormats.clear();
m_encodingAudioFormats.emplace(DEFAULT_RESOLVE_KEY, m_encoder->getContext()->getAudioFormat().encoding);
}
m_assetsManager->addLocaleAssetsObserver(shared_from_this());
return true;
}
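// Builds a resolver for unresolved MessageRequests. The current encoding formats, attachment readers, and metric
// recorder are captured by value so the resolver remains valid after AIP resets its own state. Per the "Locked"
// suffix convention used in this SDK, the caller is expected to hold m_encodingFormatMutex.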
MessageRequest::MessageRequestResolveFunction AudioInputProcessor::getMessageRequestResolverLocked() const {
auto encodingAudioFormats = m_encodingAudioFormats;
auto attachmentReaders = m_attachmentReaders;
auto metricRecorder = m_metricRecorder;
MessageRequest::MessageRequestResolveFunction resolver = [encodingAudioFormats, attachmentReaders, metricRecorder](
const std::shared_ptr<EditableMessageRequest>& req,
std::string resolveKey) {
return resolveMessageRequest(req, resolveKey, encodingAudioFormats, attachmentReaders, metricRecorder);
};
return resolver;
}
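// Negotiates one encoding per resolveKey: the requested format wins if supported, otherwise the fallback; keys
// with neither supported are dropped from the response. A hypothetical call (key name illustrative only):
//   requestEncodingAudioFormats({{"cloud", {AudioFormat::Encoding::OPUS, AudioFormat::Encoding::LPCM}}});
//   // -> {"cloud": OPUS} when an OPUS-producing encoder is configured, otherwise {"cloud": LPCM}.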
AudioInputProcessor::EncodingFormatResponse AudioInputProcessor::requestEncodingAudioFormats(
const EncodingFormatRequest& encodings) {
EncodingFormatResponse result;
for (auto it = encodings.begin(); it != encodings.end(); it++) {
auto requestedEncoding = it->second.first;
auto fallbackEncoding = it->second.second;
if (isEncodingFormatSupported(requestedEncoding)) {
result[it->first] = requestedEncoding;
} else if (isEncodingFormatSupported(fallbackEncoding)) {
result[it->first] = fallbackEncoding;
} else {
ACSDK_WARN(LX("Both provided requested and fallback encoding formats are not supported.")
.d("resolveKey", it->first)
.d("requestedEncoding", encodingFormatToString(requestedEncoding))
.d("fallbackEncoding", encodingFormatToString(fallbackEncoding)));
}
}
if (!result.empty()) {
std::lock_guard<std::mutex> lock(m_encodingFormatMutex);
m_encodingAudioFormats = result;
} else {
ACSDK_ERROR(LX("None of requested encoding audio formats are supported."));
}
return result;
}
void AudioInputProcessor::closeAttachmentReaders(attachment::AttachmentReader::ClosePoint closePoint) {
for (auto& it : m_attachmentReaders) {
for (auto& namedReader : it.second) {
if (namedReader && namedReader->reader) {
namedReader->reader->close(closePoint);
}
}
}
m_attachmentReaders.clear();
}
bool AudioInputProcessor::isEncodingFormatSupported(avsCommon::utils::AudioFormat::Encoding encodingFormat) const {
if (encodingFormat == AudioFormat::Encoding::LPCM ||
(m_encoder && m_encoder->getContext() &&
encodingFormat == m_encoder->getContext()->getAudioFormat().encoding)) {
return true;
}
return false;
}
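// True if any configured stream requires a non-LPCM format, i.e. the speech encoder must be engaged. Caller is
// expected to hold m_encodingFormatMutex.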
bool AudioInputProcessor::isUsingEncoderLocked() const {
for (auto it = m_encodingAudioFormats.begin(); it != m_encodingAudioFormats.end(); it++) {
if (it->second != AudioFormat::Encoding::LPCM) {
return true;
}
}
return false;
}
bool AudioInputProcessor::multiStreamsRequestedLocked() const {
return m_encodingAudioFormats.size() > 1;
}
AudioInputProcessor::EncodingFormatResponse AudioInputProcessor::getEncodingAudioFormats() const {
std::lock_guard<std::mutex> lock(m_encodingFormatMutex);
return m_encodingAudioFormats;
}
} // namespace aip
} // namespace capabilityAgents
} // namespace alexaClientSDK