Text To Speech（文本转语音）-526互联

项目简介

项目中有一部分需要将文本文字进行语音播放，但在网络上查询了很多，发现很多都要注册或者压根就不能用。

于是，我决定自己写一个文本语音播报软件，既可以根据需求自定义功能，又能提高编码水平。

项目实现

由于使用 Windows 10 系统，官方语音库（SAPI）肯定是最适配的。需要包含的头文件：#include "sapi.h" 和 #include "sphelper.h"

除了函数库，还需要准备参数输入库文件和本地语音token

参数输入库使用 getopt.h (从Linux移植至Windows)

Windows 默认的本地语音 token 一般有 2 个，可在注册表的以下路径查看：HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Speech\Voices\Tokens

自定义功能：

调节朗读音量
调节朗读速率
指定朗读文本
指定朗读循环次数
帮助信息

以下是 text_to_speech.cpp 的部分内容，完整代码请查看 GitHub

#include "windows.h"
#include <string>
#include <iostream>
#include "getopt.h"
#include "sapi.h"
#include "sphelper.h"
#pragma comment(lib, "sapi.lib")

using namespace std;

int main(int argc, char *argv[])
{
	int rc = 0;
	VOICE_OPTS voice_opts;
	memset(&voice_opts, 0, sizeof(VOICE_OPTS));
	/* VOICE_OPTS 变量初始化 */
	lib_tts_opt_init(&voice_opts);
	/* 命令行参数读取 */
	static struct option long_options[] = {
	{"help",    0, 0,  '?'},
	{"volume",  1, 0,  'v'},
	{"rate",    1, 0,  'r'},
	{"forTime", 1, 0,  'f'},
	{"text",    1, 0,  't'},
	{0 , 0, 0, 0}
	};
	if ((rc = lib_tts_opts_get(argc, argv, long_options, &voice_opts)) != 0)
	{
		return -1;
	}
	
	const char *str = voice_opts.text;
	wchar_t *p_wchar = char_to_wchar(str);

	::CoInitialize(NULL); // COM初始化
	CLSID CLSID_SpVoice;
	CLSIDFromProgID(L"SAPI.SpVoice", &CLSID_SpVoice);
	ISpVoice *pSpVoice = NULL;
	IEnumSpObjectTokens *pSpEnumTokens = NULL;

	// 获取ISpVoice接口
	if (FAILED(CoCreateInstance(CLSID_SpVoice, NULL, CLSCTX_INPROC_SERVER, IID_ISpVoice, (void**)&pSpVoice)))
	{
		cout << "error:获取ISpVoice接口失败" << endl;
		return -1;
	}
	/* 调节朗读音量 */
	pSpVoice->SetVolume(voice_opts.volume);
	/* 调节朗读速度 */
	pSpVoice->SetRate(voice_opts.rate);
	/* 设置同步朗读超时事件，单位：ms */
	pSpVoice->SetSyncSpeakTimeout(5000);

	// 列举所有的语音token，可以通过pSpEnumTokens指向的接口得到
	if (SUCCEEDED(SpEnumTokens(SPCAT_VOICES, NULL, NULL, &pSpEnumTokens)))
	{
		ULONG count = 0;
		pSpEnumTokens->GetCount(&count);
		// 判断本地语音token数量是否至少有1个
		if (count >= 1)
		{
			ISpObjectToken *pSpToken = NULL;
			pSpEnumTokens->Item(0, &pSpToken);
			pSpVoice->SetVoice(pSpToken); // 设置当前语音token为pSpToken
			for (int i = 0; i < voice_opts.forTime; i++)
				pSpVoice->Speak((LPCWSTR)p_wchar, SPF_DEFAULT, NULL); // 朗读中文和英文的混合字符串
			pSpToken->Release(); // 释放token
		}
		pSpEnumTokens->Release();        // 释放pSpEnumTokens接口
	}

	delete[] p_wchar;
	pSpVoice->Release();
	::CoUninitialize();

	return 0;
}

参考引用

忘记了 getopt.h 的来源

神器text-to-speech语音edge-tts

语音文本speech text

text-to-speech

speech-to-text

speech text text-to-speech speech-to-text

speech text speech-to-text text-to-speech