volcvoice

package module
v0.1.1 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Nov 12, 2024 License: MIT Imports: 19 Imported by: 0

README

volcvoice

A Go Library for Volcengine (Bytedance) Voice Service

Notes

PCM output format

16-bit signed integer, little-endian, mono; the sample rate defaults to 24 kHz.

Example ffmpeg command:

ffmpeg -f s16le -ar 24k -ac 1 -i test.pcm test.mp3

Usages

Create Client

// use the VOLCVOICE_APPID and VOLCVOICE_TOKEN environment variables
client, err := volcvoice.NewClient()

// explicitly set the app ID and token
client, err := volcvoice.NewClient(
    volcvoice.WithAppID("your_app_id"),
    volcvoice.WithToken("your_token"),
)

Voice Clone Upload

service := client.VoiceCloneUpload().
	SetSpeakerID(os.Getenv("VOLCVOICE_SPEAKER_ID")).
	AddAudio(buf, FormatMP3, "").
	SetModelType(VoiceCloneUploadModelTypeV2)

err := service.Do(context.Background())

Stream Synthesize

service := client.StreamSynthesize().
	SetInput(`“水何澹澹,山岛竦峙。树木丛生,百草丰茂。秋风萧瑟,洪波涌起”是实写眼前的景观,神奇而又壮观。“水何澹澹,山岛竦峙”是望海初得的大致印象,有点像绘画的轮廓。`).
	SetFormat(FormatPCM).
	SetRequestID(requestId.String()).
	SetCluster(StreamSynthesizeClusterV2).
	SetUserID("test").
	SetSpeakerID(os.Getenv("VOLCVOICE_SPEAKER_ID")).
	SetOutput(func(ctx context.Context, buf []byte) (err error) {
		t.Logf("len(buf): %d", len(buf))
		_, err = f.Write(buf)
		return
	})

err := service.Do(context.Background())

Bi-directional Stream Synthesize

var (
	input = []string{
		"离离原上草,一岁一枯荣。",
		"野火烧不尽,春风吹又生。",
		"远芳侵古道,晴翠接荒城。",
		"又送王孙去,萋萋满别情。",
	}
	inputIdx int64 = -1
)

service := client.DuplexSynthesize().
	SetResourceID(DuplexSynthesizeResourceVoiceCloneV2).
	SetRequestID(rg.Must(uuid.NewV7()).String()).
	SetConnectID(rg.Must(uuid.NewV7()).String()).
	SetFormat(FormatPCM).
	SetSampleRate(SampleRate16K).
	SetSpeakerID(os.Getenv("VOLCVOICE_SPEAKER_ID")).
	SetInput(func(ctx context.Context) (chunk string, err error) {
		idx := atomic.AddInt64(&inputIdx, 1)
		if idx >= int64(len(input)) {
			err = io.EOF
			return
		}
		time.Sleep(400 * time.Millisecond)
		chunk = input[idx]
		return
	}).
	SetOutput(func(ctx context.Context, chunk []byte) (err error) {
		_, err = f.Write(chunk)
		return
	}).
	SetUserID("test-user")

err := service.Do(context.Background())

Credits

GUO YANKE, MIT License

Documentation

Index

Constants

View Source
const (
	FormatAAC      = "aac"
	FormatM4A      = "m4a"
	FormatMP3      = "mp3"
	FormatOGG      = "ogg"
	FormatOGG_OPUS = "ogg_opus"
	FormatPCM      = "pcm"
	FormatWAV      = "wav"

	SampleRate8K  = 8000
	SampleRate16K = 16000
	SampleRate24K = 24000
	SampleRate32K = 32000
	SampleRate44K = 44100
	SampleRate48K = 48000
)
View Source
const (
	// DuplexSynthesizeResourceStandard is a resource id for TTS service.
	DuplexSynthesizeResourceStandard = "volc.service_type.10029"

	// DuplexSynthesizeResourceVoiceCloneV2 is a resource id for VoiceClone 2.0 service.
	DuplexSynthesizeResourceVoiceCloneV2 = "volc.megatts.default"
)
View Source
const (
	StreamSynthesizeClusterV1           = "volcano_mega"
	StreamSynthesizeClusterV1Concurrent = "volcano_mega_concurr"
	StreamSynthesizeClusterV2           = "volcano_icl"
	StreamSynthesizeClusterV2Concurrent = "volcano_icl_concurr"
)
View Source
const (
	VoiceCloneUploadLanguageCN = 0
	VoiceCloneUploadLanguageEN = 1
	VoiceCloneUploadLanguageJA = 2
	VoiceCloneUploadLanguageES = 3
	VoiceCloneUploadLanguageID = 4
	VoiceCloneUploadLanguagePT = 5

	VoiceCloneUploadModelTypeV1 = 0
	VoiceCloneUploadModelTypeV2 = 1
)

Variables

This section is empty.

Functions

This section is empty.

Types

type Client

type Client interface {
	// StreamSynthesize creates a new service for stream synthesis.
	StreamSynthesize() *StreamSynthesizeService

	// DuplexSynthesize creates a new service for bidirectional stream synthesis.
	DuplexSynthesize() *DuplexSynthesizeService

	// VoiceCloneUpload creates a new service for voice clone upload.
	VoiceCloneUpload() *VoiceCloneUploadService
}

Client is the interface for the volcvoice client.

func NewClient

func NewClient(fns ...Option) (Client, error)

NewClient creates a new client with the given options.

type DuplexSynthesizeService

type DuplexSynthesizeService struct {
	// contains filtered or unexported fields
}

DuplexSynthesizeService is a service to synthesize speech in bi-directional stream mode.

func (*DuplexSynthesizeService) Do

func (s *DuplexSynthesizeService) Do(ctx context.Context) (err error)

func (*DuplexSynthesizeService) SetConnectID

SetConnectID sets the connect id

func (*DuplexSynthesizeService) SetFormat

SetFormat sets the format of the synthesized speech

func (*DuplexSynthesizeService) SetInput

SetInput sets the input function

func (*DuplexSynthesizeService) SetOutput

SetOutput sets the output function

func (*DuplexSynthesizeService) SetPitchRate

func (s *DuplexSynthesizeService) SetPitchRate(rate int) *DuplexSynthesizeService

SetPitchRate sets the pitch rate of the synthesized speech

func (*DuplexSynthesizeService) SetRequestID

SetRequestID sets the request id

func (*DuplexSynthesizeService) SetResourceID

SetResourceID sets the resource id

func (*DuplexSynthesizeService) SetSSML

SetSSML sets the ssml mode

func (*DuplexSynthesizeService) SetSampleRate

func (s *DuplexSynthesizeService) SetSampleRate(rate int) *DuplexSynthesizeService

SetSampleRate sets the sample rate of the synthesized speech

func (*DuplexSynthesizeService) SetSpeakerID

SetSpeakerID sets the speaker id; for the VoiceClone service, use a speaker id that starts with "S_".

func (*DuplexSynthesizeService) SetSpeechRate

func (s *DuplexSynthesizeService) SetSpeechRate(rate int) *DuplexSynthesizeService

SetSpeechRate sets the speech rate of the synthesized speech

func (*DuplexSynthesizeService) SetUserID

SetUserID sets the user id

type Option

type Option func(opts *options)

func WithAppID

func WithAppID(appID string) Option

WithAppID sets the appID for the client; it defaults to the value of the VOLCVOICE_APPID environment variable.

func WithEndpoint

func WithEndpoint(endpoint string) Option

WithEndpoint sets a custom endpoint for the client. It defaults to openspeech.bytedance.com and can also be set with the VOLCVOICE_ENDPOINT environment variable.

func WithHTTPClient

func WithHTTPClient(client *http.Client) Option

WithHTTPClient sets a custom http client for the client.

func WithToken

func WithToken(token string) Option

WithToken sets the token for the client; it defaults to the value of the VOLCVOICE_TOKEN environment variable.

func WithVerbose

func WithVerbose(verbose bool) Option

WithVerbose sets verbose mode for the client; it defaults to the value of the VOLCVOICE_VERBOSE environment variable.

func WithWebsocketDialer

func WithWebsocketDialer(dialer *websocket.Dialer) Option

WithWebsocketDialer sets a custom websocket dialer for the client.

type StreamSynthesizeInput

type StreamSynthesizeInput func(ctx context.Context) (chunk string, err error)

StreamSynthesizeInput is a function that returns the next input text chunk; to signal the end of input, it should return an empty string together with an io.EOF error.

func StreamSynthesizeInputFromChannel added in v0.1.1

func StreamSynthesizeInputFromChannel(input chan string) StreamSynthesizeInput

func StreamSynthesizeInputFromSlice added in v0.1.1

func StreamSynthesizeInputFromSlice(input []string) StreamSynthesizeInput

StreamSynthesizeInputFromSlice creates a StreamSynthesizeInput from a slice of strings.

type StreamSynthesizeOutput

type StreamSynthesizeOutput func(ctx context.Context, chunk []byte) (err error)

StreamSynthesizeOutput is a function that processes an output audio chunk.

type StreamSynthesizeService

type StreamSynthesizeService struct {
	// contains filtered or unexported fields
}

StreamSynthesizeService is a service to synthesize speech in stream mode.

func (*StreamSynthesizeService) Do

func (s *StreamSynthesizeService) Do(ctx context.Context) (err error)

Do sends the audio request to the server and streams the audio chunks to the output handler.

func (*StreamSynthesizeService) SetCluster

func (s *StreamSynthesizeService) SetCluster(cluster string) *StreamSynthesizeService

SetCluster sets the cluster for the audio.

func (*StreamSynthesizeService) SetFormat

SetFormat sets the encoding for the audio.

func (*StreamSynthesizeService) SetInput

SetInput sets the text for the audio.

func (*StreamSynthesizeService) SetOutput

SetOutput sets the handler for the audio chunks.

func (*StreamSynthesizeService) SetRequestID

func (s *StreamSynthesizeService) SetRequestID(reqID string) *StreamSynthesizeService

SetRequestID sets the request id for the audio.

func (*StreamSynthesizeService) SetSSML

SetSSML sets the text type to SSML.

func (*StreamSynthesizeService) SetSpeakerID

func (s *StreamSynthesizeService) SetSpeakerID(speakerID string) *StreamSynthesizeService

SetSpeakerID sets the voice type for the audio, also known as the speaker id.

func (*StreamSynthesizeService) SetUserID

SetUserID sets the user id for the audio.

type VoiceCloneUploadAudio

type VoiceCloneUploadAudio struct {
	AudioBytes  string `json:"audio_bytes"`
	AudioFormat string `json:"audio_format,omitempty"`
	Text        string `json:"text,omitempty"`
}

type VoiceCloneUploadResponse

type VoiceCloneUploadResponse struct {
	BaseResp struct {
		StatusCode    int    `json:"StatusCode"`
		StatusMessage string `json:"StatusMessage"`
	} `json:"BaseResp"`
	SpeakerID string `json:"speaker_id"`
}

type VoiceCloneUploadService

type VoiceCloneUploadService struct {
	// contains filtered or unexported fields
}

func (*VoiceCloneUploadService) AddAudio

func (s *VoiceCloneUploadService) AddAudio(buf []byte, format string, text string) *VoiceCloneUploadService

func (*VoiceCloneUploadService) Do

func (s *VoiceCloneUploadService) Do(ctx context.Context) (err error)

func (*VoiceCloneUploadService) SetLanguage

func (s *VoiceCloneUploadService) SetLanguage(language int) *VoiceCloneUploadService

func (*VoiceCloneUploadService) SetModelType

func (s *VoiceCloneUploadService) SetModelType(modelType int) *VoiceCloneUploadService

func (*VoiceCloneUploadService) SetSpeakerID

func (s *VoiceCloneUploadService) SetSpeakerID(speakerID string) *VoiceCloneUploadService

Directories

Path Synopsis
internal

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL