Repository: thuhcsi/SpeechCraft
Branch: master
Commit: 3ef02110bd7b
Files: 10
Total size: 136.3 KB
Directory structure:
gitextract_tgmekqoo/
├── AutomaticPipeline/
│ ├── AgePreTrainModel.py
│ ├── AutoPipeline.py
│ ├── Clustering.py
│ ├── PitchEnergy.py
│ ├── models/
│ │ └── SECap/
│ │ └── model2.py
│ └── outputs/
│ ├── labels_LJspeech_0.json
│ └── labels_LJspeech_0.scp
├── README.md
├── llama-ft/
│ └── llama_infer.py
└── requirements.yaml
================================================
FILE CONTENTS
================================================
================================================
FILE: AutomaticPipeline/AgePreTrainModel.py
================================================
import numpy as np
import torch
import torch.nn as nn
from transformers.models.wav2vec2.modeling_wav2vec2 import (
Wav2Vec2Model,
Wav2Vec2PreTrainedModel,
)
from torch.nn import functional as F
class ModelHead(nn.Module):
    r"""Projection head mapping pooled encoder features to ``num_labels`` outputs.

    The forward pass is: dropout -> linear (hidden -> hidden) -> tanh ->
    dropout -> linear (hidden -> num_labels).
    """

    def __init__(self, config, num_labels):
        """Build the head from ``config.hidden_size`` and ``config.final_dropout``."""
        super().__init__()
        width = config.hidden_size
        # Attribute names are part of the checkpoint layout; keep them as-is.
        self.dense = nn.Linear(width, width)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(width, num_labels)

    def forward(self, features, **kwargs):
        """Return the head output for ``features``; extra keyword arguments are ignored."""
        activated = torch.tanh(self.dense(self.dropout(features)))
        return self.out_proj(self.dropout(activated))
class AgeGenderModel(Wav2Vec2PreTrainedModel):
    r"""Wav2Vec2 encoder followed by a single-output ``age`` head.

    The encoder's first output is averaged over dimension 1 and passed
    through a :class:`ModelHead` with one output unit.
    """

    def __init__(self, config):
        """Create the encoder and the age head, then initialise weights."""
        super().__init__(config)
        self.config = config
        self.wav2vec2 = Wav2Vec2Model(config)
        self.age = ModelHead(config, 1)
        self.init_weights()

    def forward(self, input_values):
        """Return the age head output for ``input_values``."""
        encoder_out = self.wav2vec2(input_values)
        # Pool over dim 1 (presumably the time axis -- TODO confirm).
        pooled = encoder_out[0].mean(dim=1)
        return self.age(pooled)
================================================
FILE: AutomaticPipeline/AutoPipeline.py
================================================
# coding=utf-8
import os
import argparse
import numpy as np
import torch
import librosa
from typing import List, Optional, Union, Dict
from tqdm import tqdm
import torchaudio
from torch.utils.data import DataLoader
from torch.nn import functional as F
from transformers import (
AutoModelForAudioClassification,
Wav2Vec2Processor,
LlamaTokenizer,
AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
)
from funasr import AutoModel
import sys
sys.path.append('../SECap') # path to the directory of SECap
from AgePreTrainModel import AgeGenderModel
from model2 import MotionAudio
from PitchEnergy import process_audio
from g2p_en import G2p
# Force the "spawn" start method for worker processes created in main()
# (presumably required because each worker initialises CUDA -- TODO confirm).
torch.multiprocessing.set_start_method('spawn', force=True)
# Module-level default device; inference_on_device() receives its own
# ``device`` argument, which shadows this name inside that function.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# dtype handed to the ASR pipeline built in inference_on_device().
torch_dtype = torch.float32
def to_device(tensors, device):
    """Return a list copy of ``tensors`` with every ``torch.Tensor`` moved to ``device``.

    Items that are not ``torch.Tensor`` instances are passed through unchanged.
    """
    return [
        item.to(device) if isinstance(item, torch.Tensor) else item
        for item in tensors
    ]
class CustomDataset(torch.utils.data.Dataset):
    """One shard of the ``.wav`` files found recursively under ``basedir``.

    The sorted file list is split into ``num_devices`` contiguous shards and
    this dataset exposes shard ``number``. Every file belongs to exactly one
    shard; when the file count is not divisible by ``num_devices`` the first
    shards receive one extra file each.

    Args:
        basedir: Root directory that is walked for ``.wav`` files.
        sampling_rate: Target sampling rate of the returned audio.
        max_audio_len: Maximum clip length in seconds.
        number: Index of the shard served by this instance.
        num_devices: Total number of shards.
    """

    def __init__(
        self,
        basedir: Optional[str] = None,
        sampling_rate: int = 16000,
        max_audio_len: int = 5,
        number: int = 0,
        num_devices: int = 4,
    ):
        self.num_devices = num_devices
        self.number = number
        self.basedir = basedir
        self.sampling_rate = sampling_rate
        self.max_audio_len = max_audio_len
        self.dataset = []
        self.category = []
        self.text_tn = []
        self.__preprocess__()

    def __preprocess__(self):
        """Collect the ``.wav`` paths and keep only this instance's shard."""
        paths = []
        for dirname, _subdirs, files in os.walk(self.basedir):
            for filename in files:
                # Skip hidden files (names starting with a dot).
                if filename.startswith('.'):
                    continue
                if filename.endswith('.wav'):
                    paths.append(os.path.join(dirname, filename))
        # Sort so that every worker process derives the same partition.
        paths.sort()
        # Balanced contiguous split: plain floor division would silently
        # drop the last ``len(paths) % num_devices`` files.
        base, extra = divmod(len(paths), self.num_devices)
        start = self.number * base + min(self.number, extra)
        stop = start + base + (1 if self.number < extra else 0)
        self.dataset = paths[start:stop]

    def __len__(self):
        """
        Return the length of the dataset
        """
        return len(self.dataset)

    def _cutorpad(self, audio: np.ndarray) -> np.ndarray:
        """
        Cut or zero-pad a 1-D audio array to ``sampling_rate * max_audio_len`` samples
        """
        effective_length = self.sampling_rate * self.max_audio_len
        len_audio = len(audio)
        if len_audio > effective_length:
            return audio[:effective_length]
        if len_audio < effective_length:
            # Right-pad with zeros; the result stays a NumPy array.
            return np.pad(audio, (0, effective_length - len_audio))
        return audio

    def __getitem__(self, index) -> Optional[tuple]:
        """
        Return ``(audio, filepath)`` for the file at ``index``.

        ``audio`` is a 1-D array at ``sampling_rate`` truncated to
        ``max_audio_len`` seconds. ``None`` is returned when the file
        decodes to zero samples.
        """
        filepath = self.dataset[index]
        # Decode straight to mono at the target rate instead of loading at
        # librosa's default rate and resampling a second time.
        speech_array, sr = librosa.load(filepath, sr=self.sampling_rate, mono=True)
        if len(speech_array) == 0:
            return None
        speech_array = speech_array[: self.max_audio_len * sr]
        return speech_array, filepath
class CollateFunc:
    """Collate ``(audio, path)`` samples into a padded batch.

    Args:
        w2v_processor: Processor used to turn raw audio into ``input_values``
            and to pad the batch.
        max_length: Forwarded to ``w2v_processor.pad``.
        padding: Forwarded to ``w2v_processor.pad``.
        pad_to_multiple_of: Forwarded to ``w2v_processor.pad``.
        sampling_rate: Sampling rate of the incoming audio; also used to
            convert sample counts to durations in seconds.
    """

    def __init__(
        self,
        w2v_processor: "Wav2Vec2Processor",
        max_length: Optional[int] = None,
        padding: Union[bool, str] = True,
        pad_to_multiple_of: Optional[int] = None,
        sampling_rate: int = 16000,
    ):
        self.padding = padding
        self.w2v_processor = w2v_processor
        self.max_length = max_length
        self.sampling_rate = sampling_rate
        self.pad_to_multiple_of = pad_to_multiple_of

    def __call__(self, batch: List):
        """Return ``(padded_batch, audiopaths, durations)`` for ``batch``.

        Items equal to ``None`` (the dataset's marker for an empty audio
        file) are skipped.

        Raises:
            ValueError: If no usable sample remains in ``batch``.
        """
        input_features = []
        audiopaths = []
        durations = []
        for item in batch:
            if item is None:
                continue
            audio, audiopath = item
            audiopaths.append(audiopath)
            input_tensor = self.w2v_processor(audio, sampling_rate=self.sampling_rate).input_values
            input_features.append({"input_values": np.squeeze(input_tensor)})
            durations.append(len(audio) / self.sampling_rate)
        if not input_features:
            raise ValueError("CollateFunc received a batch without any non-empty audio sample")
        padded = self.w2v_processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        return padded, audiopaths, durations
def age_predict(batch, model, device):
    r"""Predict age from raw audio signal.

    Args:
        batch: Tensor of input values accepted by ``model``.
        model: Callable returning one score per sample.
        device: Device the batch is moved to before the forward pass.

    Returns:
        A list of ints, each being the model score multiplied by 100 and
        truncated towards zero.
    """
    audios = batch.to(device)
    preds = model(audios).detach().cpu().numpy()
    # Flatten first so every element is a true scalar: calling int() on a
    # size-1 array with ndim > 0 is deprecated in NumPy.
    return [int(score * 100) for score in preds.reshape(-1)]
def gender_predict(batch, model, device):
    r"""Predict gender from raw audio signal.

    ``batch`` must provide ``'input_values'`` and ``'attention_mask'``
    tensors. Returns one of ``'female'`` / ``'male'`` per sample, chosen as
    the arg-max of the softmax over the model logits.
    """
    label_names = ['female', 'male']
    input_values = batch['input_values'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    logits = model(input_values, attention_mask=attention_mask).logits
    probabilities = F.softmax(logits, dim=-1)
    class_ids = torch.argmax(probabilities, dim=1).cpu().detach().numpy()
    return [label_names[class_id] for class_id in class_ids]
# def emotion_predict(batch, model, device, feature_extractor):
def emotion_predict(audiopaths, model):
    """Return one emotion label per audio path.

    ``model.generate`` is expected to yield, per input, a mapping with a
    ``'scores'`` list; the index of the highest score selects the label.
    Several indices of the label table map to ``'neutral'``.
    """
    emotionlabels = ['angry', 'disgusted', 'fearful', 'happy', 'neutral', 'neutral', 'sad', 'surprised', 'neutral']
    emotions = []
    for result in model.generate(audiopaths, granularity="utterance", extract_embedding=False):
        score = result['scores']
        # First position of the maximum score wins on ties.
        emotions.append(emotionlabels[score.index(max(score))])
    return emotions
def pitch_energy_calculate(input_values):
    r"""Compute mean pitch and mean energy for every row of ``input_values``.

    Each row is passed to ``process_audio`` with ``sr=16000``. Returns two
    parallel lists: ``(pitchs, energys)``.
    """
    waveforms = input_values.cpu().detach().numpy()
    stats = [process_audio(waveform, sr=16000) for waveform in waveforms]
    pitchs = [mean_pitch for mean_pitch, _ in stats]
    energys = [mean_energy for _, mean_energy in stats]
    return pitchs, energys
def inference_on_device(device, i, num_devices, language, basedir, scp_path):
    """Label shard ``i`` of the corpus on ``device`` and append rows to an scp file.

    For every audio file the function writes one tab-separated row holding:
    parent directory name, file stem, age, gender, mean pitch, mean energy,
    speed, emotion and transcript.

    Args:
        device: Torch device all models are moved to.
        i: Index of the shard processed by this worker; also used as the
            suffix of the output file name.
        num_devices: Total number of shards/workers.
        language: ``'english'`` selects the emotion2vec + g2p path; any other
            value selects the SECap captioning path.
        basedir: Root directory scanned for ``.wav`` files.
        scp_path: Base output path; the worker writes to ``<stem>_<i>.scp``.
    """
    sampling_rate = 16000
    batch_size = 4
    gender_model_path = "alefiury/wav2vec2-large-xlsr-53-gender-recognition-librispeech"
    age_model_path = "audeering/wav2vec2-large-robust-24-ft-age-gender"
    asr_path = "openai/whisper-medium"
    # asr_path = "openai/whisper-large-v3"
    # One output file per worker so that processes never share a file.
    scp_path = os.path.splitext(scp_path)[0] + '_' + str(i) + '.scp'

    # Gender Predict
    gender_model = AutoModelForAudioClassification.from_pretrained(
        pretrained_model_name_or_path=gender_model_path,
        num_labels=2,
        label2id={"female": 0, "male": 1},
        id2label={0: "female", 1: "male"},
    )
    gender_model.to(device)
    gender_model.eval()

    # Age Predict
    w2v_processor = Wav2Vec2Processor.from_pretrained(age_model_path)
    age_model = AgeGenderModel.from_pretrained(age_model_path, use_auth_token=False)
    age_model.to(device)
    age_model.eval()

    # Emotion Predict
    if language == 'english':
        emotion_model = AutoModel(model="./models/emotion2vec_base_finetuned", model_revision="v2.0.4")
        # The grapheme-to-phoneme model is only needed for the English speed estimate.
        g2p_model = G2p()
    else:
        emotion_model = MotionAudio()
        llama_ckpt = "../SECap/weights/models--minlik--chinese-llama-7b-merged/snapshots/1ca4d87576f1fef4d44a949fb65bbe6b96675872"
        llama_tokenizer = LlamaTokenizer.from_pretrained(llama_ckpt)
        ckpt_path = "../SECap/model.ckpt"
        torch.cuda.empty_cache()
        state_dict = torch.load(ckpt_path, map_location=torch.device('cpu'))
        emotion_model.load_state_dict(state_dict, strict=False)
        emotion_model.to(device)
        # torch.no_grad() does not disable dropout; switch to eval mode explicitly.
        emotion_model.eval()

    # ASR
    asr_processor = AutoProcessor.from_pretrained(asr_path)
    asr_model = AutoModelForSpeechSeq2Seq.from_pretrained(asr_path)
    asr_model.to(device)
    asr_pipe = pipeline(
        "automatic-speech-recognition",
        model=asr_model,
        tokenizer=asr_processor.tokenizer,
        feature_extractor=asr_processor.feature_extractor,
        max_new_tokens=128,
        chunk_length_s=30,
        batch_size=batch_size,
        return_timestamps=False,
        torch_dtype=torch_dtype,
        device=device,
    )

    inferset = CustomDataset(basedir, sampling_rate=sampling_rate, number=i, num_devices=num_devices)
    data_collator = CollateFunc(
        w2v_processor=w2v_processor,
        padding=True,
        sampling_rate=16000,
    )
    test_dataloader = DataLoader(
        dataset=inferset,
        batch_size=batch_size,
        collate_fn=data_collator,
        shuffle=False,
        num_workers=0,
    )

    with torch.no_grad():
        for load_data in tqdm(test_dataloader):
            audios, audiopaths, durations = to_device(load_data, device)
            audio_features = audios['input_values']
            waveforms = audio_features.cpu().numpy()
            transcripts = [
                asr_pipe(waveform, return_timestamps=False, generate_kwargs={"language": language})["text"]
                for waveform in waveforms[:len(audiopaths)]
            ]
            ages = age_predict(audio_features, model=age_model, device=device)
            genders = gender_predict(audios, model=gender_model, device=device)
            pitchs, energys = pitch_energy_calculate(audio_features)
            if language == 'english':
                # Convert each transcript on its own (not the whole batch list).
                phonemes = [g2p_model(transcript) for transcript in transcripts]
                # Speed is seconds per phoneme; NaN marks an empty transcript
                # instead of raising ZeroDivisionError.
                speeds = [
                    duration / len(phones) if len(phones) else float("nan")
                    for duration, phones in zip(durations, phonemes)
                ]
                emotions = emotion_predict(audiopaths, model=emotion_model)
            else:
                # Speed is seconds per transcript character; NaN for empty transcripts.
                speeds = [
                    duration / len(transcript) if len(transcript) else float("nan")
                    for duration, transcript in zip(durations, transcripts)
                ]
                prompts = emotion_model.inference(audio_features.to(device))
                emotions = llama_tokenizer.batch_decode(prompts, skip_special_tokens=True)
            # Append after every batch so finished work survives an interruption.
            with open(scp_path, 'a', encoding='utf-8') as file:
                for idx, audiopath in enumerate(audiopaths):
                    parent_dir = os.path.basename(os.path.dirname(audiopath))
                    stem = os.path.splitext(os.path.basename(audiopath))[0]
                    file.write(f"{parent_dir}\t{stem}\t{ages[idx]}\t{genders[idx]}\t{pitchs[idx]}\t{energys[idx]}\t{speeds[idx]}\t{emotions[idx]}\t{transcripts[idx]}\n")
def main(args):
    """Start one ``inference_on_device`` process per listed CUDA device and wait for all.

    ``args.devices`` is a comma-separated list of CUDA device indices; the
    position of a device in that list is the shard index given to its worker.
    """
    device_ids = list(map(int, args.devices.split(',')))
    worker_count = len(device_ids)
    workers = []
    for rank, device_id in enumerate(device_ids):
        worker = torch.multiprocessing.Process(
            target=inference_on_device,
            args=(
                torch.device(f'cuda:{device_id}'),
                rank,
                worker_count,
                args.language,
                args.basedir,
                args.scp_path,
            ),
        )
        worker.start()
        workers.append(worker)
    for worker in workers:
        worker.join()
if __name__ == "__main__":
    # Command-line entry point: every option is a string with a default.
    parser = argparse.ArgumentParser()
    for flag, default in (
        ('--language', 'chinese'),
        ('--devices', '0'),
        ('--basedir', '../FastSpeech2/raw_data/AISHELL3/SSB0005'),
        ('--scp_path', './outputs/labels_AISHELL.scp'),
    ):
        parser.add_argument(flag, type=str, default=default)
    main(parser.parse_args())
================================================
FILE: AutomaticPipeline/Clustering.py
================================================
import pandas as pd
import csv
import numpy as np
import argparse
import json
def assign_pitch_group(row, language, male_percentiles, female_percentiles):
    """Bucket ``row['pitch']`` into low / normal / high using gender-specific cut points.

    Rows whose gender is ``'male'`` use ``male_percentiles``; every other
    row uses ``female_percentiles``. Each percentile pair is
    ``(low_upper_bound, normal_upper_bound)``, both inclusive. Labels are
    English when ``language == 'en'`` and Chinese otherwise.
    """
    cut_points = male_percentiles if row['gender'] == 'male' else female_percentiles
    pitch = row['pitch']
    if pitch <= cut_points[0]:
        english, chinese = 'low', '低'
    elif pitch <= cut_points[1]:
        english, chinese = 'normal', '中'
    else:
        english, chinese = 'high', '高'
    return english if language == 'en' else chinese
def replace_age_with_text(row, language):
    """Map the numeric ``row['age']`` to an age-group label.

    Upper bounds are exclusive: <14, <26, <40, <55, and everything else.
    Labels are English when ``language == 'en'`` and Chinese otherwise.
    """
    age = row['age']
    brackets = (
        (14, "Child", '小孩'),
        (26, "Teenager", '少年'),
        (40, "Young Adult", '青年'),
        (55, "Middle-aged", '中年'),
    )
    for upper_bound, english, chinese in brackets:
        if age < upper_bound:
            return english if language == 'en' else chinese
    return "Elderly" if language == 'en' else '老年'
def main(args):
    """Turn raw per-utterance measurements into categorical labels.

    Reads the tab-separated scp file at ``args.input_path``, replaces age
    with an age group, buckets pitch (per gender), energy and speed into
    three classes each, and writes two files next to the input:
    ``<stem>_clusterd.scp`` (tab-separated, no header) and ``<stem>.json``
    (``{"<filename1>_<filename2>": {"labels": "..."}}``).

    Args:
        args: Namespace with ``input_path`` and ``language`` (``'en'`` for
            English labels, anything else for Chinese labels).
    """
    input_path = args.input_path
    language = args.language

    df = pd.read_csv(input_path, encoding='utf-8', sep='\t', header=None, quoting=csv.QUOTE_NONE)
    df.columns = ['filename1', 'filename2', 'age', 'gender', 'pitch', 'energy', 'speed', 'emotion', 'transcript']
    df = df.dropna()

    def _pitch_percentiles(gender, quantiles):
        # A corpus may contain a single gender; the thresholds for the
        # missing one are never consulted, so NaN placeholders are enough.
        values = df.loc[df['gender'] == gender, 'pitch']
        if values.empty:
            return np.full(len(quantiles), np.nan)
        return np.percentile(values, quantiles)

    male_percentiles = _pitch_percentiles('male', [40, 90])
    female_percentiles = _pitch_percentiles('female', [10, 60])

    df['age'] = df.apply(replace_age_with_text, axis=1, language=language)
    df['pitch_group'] = df.apply(
        assign_pitch_group,
        axis=1,
        language=language,
        male_percentiles=male_percentiles,
        female_percentiles=female_percentiles,
    )
    df['energy_group'] = pd.qcut(df['energy'], 3, labels=["low", "normal", "high"] if language == 'en' else ["低", "中", "高"])
    # Labels are listed for ascending ``speed`` values: the lowest tercile is
    # labelled fast and the highest slow.
    df['speed_group'] = pd.qcut(df['speed'], 3, labels=["fast", "normal", "slow"] if language == 'en' else ["快", "中", "慢"])

    # Replace only a trailing ".scp"; a plain str.replace would also touch
    # directory names and, without the suffix, overwrite the input file.
    suffix = '.scp'
    stem = input_path[:-len(suffix)] if input_path.endswith(suffix) else input_path

    df_to_save = df[['filename1', 'filename2', 'age', 'gender', 'pitch_group', 'energy_group', 'speed_group', 'emotion', 'transcript']]
    df_to_save.to_csv(stem + '_clusterd.scp', sep='\t', header=False, index=False, quoting=csv.QUOTE_NONE)

    result_dict = {}
    for _, row in df_to_save.iterrows():
        key = f"{row['filename1']}_{row['filename2']}"
        if language == 'en':
            value = (
                f"age:{row['age']}\t"
                f"gender:{row['gender']}\t"
                f"pitch:{row['pitch_group']}\t"
                f"volume:{row['energy_group']}\t"
                f"speed:{row['speed_group']}\t"
                f"emotion:{row['emotion']}\t"
                f"transcription:{row['transcript']}"
            )
        else:
            # Fields are tab-separated exactly like the English variant:
            # a tab between every pair of fields and none at the end.
            value = (
                f"语气:{row['emotion']}\t"
                f"年龄:{row['age']}\t"
                f"性别:{row['gender']}\t"
                f"音高:{row['pitch_group']}\t"
                f"音量:{row['energy_group']}\t"
                f"语速:{row['speed_group']}\t"
                f"文本:{row['transcript']}"
            )
        result_dict[key] = {'labels': value}

    with open(stem + '.json', 'w', encoding='utf-8') as json_file:
        json.dump(result_dict, json_file, ensure_ascii=False, indent=4)
if __name__ == "__main__":
    # Command-line entry point: label language and the scp file to cluster.
    parser = argparse.ArgumentParser()
    for flag, default in (
        ('--language', 'en'),
        ('--input_path', './outputs/labels_LJspeech_0.scp'),
    ):
        parser.add_argument(flag, type=str, default=default)
    main(parser.parse_args())
================================================
FILE: AutomaticPipeline/PitchEnergy.py
================================================
import librosa
import numpy as np
def extract_pitch(wav, sr):
    """Return the pitch array from ``librosa.core.piptrack``; the magnitudes are discarded."""
    pitch_matrix, _magnitudes = librosa.core.piptrack(y=wav, sr=sr)
    return pitch_matrix
def calculate_mean_pitch(pitches):
    """Return the mean over columns of the per-column mean of positive pitch values.

    ``pitches`` is a 2-D array; each column (presumably one analysis frame
    -- TODO confirm against the caller) contributes the mean of its
    strictly positive entries. Columns without any positive entry are
    ignored. When no column qualifies the result is ``NaN``.
    """
    column_means = [
        np.mean(column[column > 0])
        for column in pitches.T
        if np.any(column > 0)
    ]
    if not column_means:
        # Avoid NumPy's "Mean of empty slice" RuntimeWarning for input
        # without any positive pitch value.
        return np.nan
    return np.mean(column_means)
def process_audio(audio, sr):
    """Return ``(mean_pitch, mean_energy)`` for one waveform.

    Pitch comes from ``extract_pitch`` / ``calculate_mean_pitch``; energy is
    the mean of the non-NaN values of ``librosa.feature.rms``.
    """
    mean_pitch = calculate_mean_pitch(extract_pitch(audio, sr=sr))
    rms = librosa.feature.rms(y=audio)
    valid_rms = rms[~np.isnan(rms)]
    return mean_pitch, np.mean(valid_rms)
================================================
FILE: AutomaticPipeline/models/SECap/model2.py
================================================
import torch
import torch.nn as nn
import lightning.pytorch as pl
from module.Qformer import BertConfig, BertLMHeadModel
from transformers import (
Wav2Vec2FeatureExtractor,
HubertModel,
BertTokenizer,
BertModel,
LlamaTokenizer
)
from module.modeling_llama import LlamaForCausalLM
from CLUB_modules.mi_estimators import *
from tool.get_sentence_simi import SimiCal
import torch.nn.functional as F
from transformers import StoppingCriteria, StoppingCriteriaList
import numpy as np
import os
class KeywordsStoppingCriteria(StoppingCriteria):
    """Stop generation when the last token of the first sequence is one of the keyword ids."""

    def __init__(self, keywords_ids: list):
        self.keywords = keywords_ids

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        # Only the first sequence of the batch is inspected.
        last_token = input_ids[0][-1]
        return last_token in self.keywords
class MotionAudio(pl.LightningModule):
    """Speech emotion captioning model: HuBERT -> Q-Former -> LLaMA.

    Audio is encoded by a HuBERT model, compressed into 32 query embeddings
    by a Q-Former, projected to the LLaMA embedding width and prepended
    (together with a BOS token and a text prompt) to the LLaMA input.
    HuBERT, text2vec and LLaMA parameters are frozen in ``__init__``; the
    Q-Former, the query tokens and the two linear projections are created
    afterwards and therefore stay trainable.

    Args:
        hubert_ckpt: HuBERT checkpoint directory, relative to this file.
        text2vec_ckpt: text2vec checkpoint directory, relative to this file.
        llama_ckpt: LLaMA checkpoint directory, relative to this file.
    """

    def __init__(
        self,
        hubert_ckpt="weights/models--TencentGameMate--chinese-hubert-large/snapshots/90cb660492214f687e60f5ca509b20edae6e75bd",
        text2vec_ckpt="weights/models--shibing624--text2vec-base-chinese/snapshots/26420fdf61ddfd92fafbaf3bc21a7c06b1812248",
        llama_ckpt="weights/models--minlik--chinese-llama-7b-merged/snapshots/1ca4d87576f1fef4d44a949fb65bbe6b96675872",
    ):
        super(MotionAudio, self).__init__()
        # Resolve checkpoint paths relative to the directory of this file.
        current_directory = os.path.dirname(os.path.abspath(__file__))
        hubert_ckpt = os.path.join(current_directory, hubert_ckpt)
        text2vec_ckpt = os.path.join(current_directory, text2vec_ckpt)
        llama_ckpt = os.path.join(current_directory, llama_ckpt)
        # hubert
        self.hubert_model = HubertModel.from_pretrained(hubert_ckpt)
        self.hubert_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(hubert_ckpt)
        # text2vec
        self.text2vec_model = BertModel.from_pretrained(text2vec_ckpt)
        self.text2vec_tokenizer = BertTokenizer.from_pretrained(text2vec_ckpt)
        # llama
        self.llama_model = LlamaForCausalLM.from_pretrained(llama_ckpt, torch_dtype="auto")
        self.llama_tokenizer = LlamaTokenizer.from_pretrained(llama_ckpt)
        if self.llama_tokenizer.pad_token_id is None:
            self.llama_tokenizer.pad_token = self.llama_tokenizer.unk_token
        # Freeze everything registered so far; modules created below this
        # loop keep requires_grad=True.
        for p in self.parameters():
            p.requires_grad = False
        # Qformer
        self.audio_Qformer, self.audio_query_tokens = self.init_Qformer(num_query_token=32, vision_width=768)
        # Drop the Q-Former sub-modules that are not used by this model.
        self.audio_Qformer.cls = None
        self.audio_Qformer.bert.embeddings.word_embeddings = None
        self.audio_Qformer.bert.embeddings.position_embeddings = None
        for layer in self.audio_Qformer.bert.encoder.layer:
            layer.output = None
            layer.intermediate = None
        self.audio_project = nn.Linear(1024, 768)
        self.audio_llama_project = nn.Linear(768, 4096)

    def init_Qformer(self, num_query_token, vision_width, cross_attention_freq=2):
        """Build the Q-Former and its learnable query tokens.

        The Q-Former is a ``BertLMHeadModel`` configured with cross-attention
        and initialised (non-strictly) from the bert-base-chinese weights.

        Returns:
            ``(Qformer, query_tokens)`` where ``query_tokens`` has shape
            ``(1, num_query_token, hidden_size)``.
        """
        path = os.path.dirname(os.path.abspath(__file__))
        config_path = os.path.join(path, "weights/models--bert-base-chinese/snapshots/8d2a91f91cc38c96bb8b4556ba70c392f8d5ee55")
        encoder_config = BertConfig.from_pretrained(config_path)
        encoder_config.encoder_width = vision_width
        # insert cross-attention layer every other block
        encoder_config.add_cross_attention = True
        encoder_config.cross_attention_freq = cross_attention_freq
        encoder_config.query_length = num_query_token
        Qformer = BertLMHeadModel(config=encoder_config)
        ckpt = os.path.join(path, "weights/models--bert-base-chinese/snapshots/8d2a91f91cc38c96bb8b4556ba70c392f8d5ee55/pytorch_model.bin")
        Qformer.load_state_dict(torch.load(ckpt), strict=False)
        query_tokens = nn.Parameter(
            torch.zeros(1, num_query_token, encoder_config.hidden_size)
        )
        query_tokens.data.normal_(mean=0.0, std=encoder_config.initializer_range)
        return Qformer, query_tokens

    def mean_pooling(self, model_output, attention_mask):
        """Return the attention-mask-weighted mean of the token embeddings."""
        token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    def forward(self, audio, describtion):
        """Return the language-modelling loss of ``describtion`` given ``audio``.

        Args:
            audio: Raw audio accepted by the HuBERT feature extractor.
            describtion: List of target caption strings, one per audio.
        """
        # hubert (frozen): run without gradient tracking
        with torch.no_grad():
            audio_feature = self.hubert_feature_extractor(audio, padding=True, return_tensors="pt", sampling_rate=16000).input_values.to(self.device)
            audio_feature = audio_feature.half()
            audio_feature = self.hubert_model(audio_feature).last_hidden_state
        audio_feature = self.audio_project(audio_feature)
        # text2vec
        with torch.no_grad():
            # Append the end-of-sequence marker to every target caption.
            describtion = [s + "</s>" for s in describtion]
            describtion_input = self.text2vec_tokenizer(describtion, padding=True, truncation=True, return_tensors='pt').to(self.device)
            describtion_feature = self.text2vec_model(**describtion_input)
            # NOTE: this pooled text feature is not consumed later in this method.
            describtion_feature = self.mean_pooling(describtion_feature, describtion_input['attention_mask']).unsqueeze(1)
        # Qformer
        audio_query_tokens = self.audio_query_tokens.expand(audio_feature.shape[0], -1, -1)
        frame_atts = torch.ones(audio_feature.size()[:-1], dtype=torch.long).to(audio_feature.device)
        audio_query_output = self.audio_Qformer.bert(
            query_embeds=audio_query_tokens,  # [32,768]
            encoder_hidden_states=audio_feature,
            encoder_attention_mask=frame_atts,
            return_dict=True,
        )
        audio_hidden = audio_query_output.last_hidden_state
        text_tokens = self.llama_tokenizer(describtion, padding="longest", truncation=True, return_tensors='pt', add_special_tokens=False).to(self.device)
        audio_input = self.audio_llama_project(audio_hidden)
        batchsize = audio_input.shape[0]
        bos = torch.ones([batchsize, 1], dtype=text_tokens.input_ids.dtype).to(self.device) * self.llama_tokenizer.bos_token_id
        bos_embeds = self.llama_model.model.embed_tokens(bos.to(self.device))
        # in training, we use different prompts for each audio
        prompts = [
            "请用一句话用中文表述音频中说话人的情感状态:",
            "请用一句中文概括音频中讲话者的情感:",
            "请用一句中文简述音频里说话者的情感表现:",
            "请用一句中文概述所给音频中说话人的情感:",
            "请用一句话用中文描述音频中说话人的情感:",
            "请用一句中文描绘音频中说话者的情感:",
            "请用一句中文描述所给音频中说话人的情感:",
            "请用一句中文简要表述音频中说话人的情感:",
            "请用一句中文概括所给音频中说话者的情感:",
            "请用一句话用中文描述所给音频中说话人的情感:",
            "请用一句中文简述所给音频里说话者的情感:",
            "请用一句中文描述音频中讲话者的情感:",
            "请用一句中文概述音频中说话人的情感:",
            "请用一句话用中文表达音频中说话者的情感:",
            "请用一句中文简要描述音频中说话人的情感:",
            "请用一句中文概括音频中说话人的情感:",
            "请用一句中文描述所给音频中讲话者的情感:",
            "请用一句中文简述音频中说话者的情感:",
            "请用一句中文概述所给音频中讲话者的情感:",
            "请用一句话用中文描述音频中讲话者的情感:",
            "请用一句中文描述音频中说话人的情感状态:",
            "请用一句中文概括所给音频里说话者的情感:",
            "请用一句中文简述所给音频中说话人的情感表现:",
            "请用一句中文概述音频里说话者的情感:",
            "请用一句话用中文描述音频中说话人的情感表现:",
            "请用一句中文描绘所给音频中说话者的情感:",
            "请用一句中文描述音频里讲话者的情感:",
            "请用一句中文简要表述所给音频中说话人的情感:",
            "请用一句中文概括音频里说话者的情感:",
            "请用一句话用中文描述所给音频中讲话者的情感:",
        ]
        import random
        prompt = random.choice(prompts)
        prompts_id = self.llama_tokenizer(prompt, return_tensors='pt').input_ids.to(self.device)
        prompts_id = prompts_id.expand(batchsize, -1)
        prompts_embeds = self.llama_model.model.embed_tokens(prompts_id)
        # Padding positions are excluded from the loss via the -100 label.
        targets = text_tokens.input_ids.masked_fill(
            text_tokens.input_ids == self.llama_tokenizer.pad_token_id, -100
        )
        text_embeds = self.llama_model.model.embed_tokens(text_tokens.input_ids.to(self.device))
        input_embeds = torch.cat([bos_embeds, audio_input, prompts_embeds, text_embeds], dim=1)
        atts_audio = torch.ones(audio_input.size()[:-1], dtype=torch.long).to(audio_input.device)
        attns_text = text_tokens.attention_mask
        attns_bos = atts_audio[:, :1]
        attns_prompt = torch.ones(prompts_embeds.size()[:-1], dtype=torch.long).to(prompts_embeds.device)
        attns = torch.cat([attns_bos, atts_audio, attns_prompt, attns_text], dim=1)
        outputs = self.llama_model(
            inputs_embeds=input_embeds,
            attention_mask=attns,
            labels=targets,
            return_dict=True,
        )
        return outputs.loss

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        audio, describtion, _ = batch
        loss = self.forward(audio, describtion)
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True, batch_size=len(audio), sync_dist=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss for one batch."""
        audio, describtion, _ = batch
        loss = self.forward(audio, describtion)
        self.log('val_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True, batch_size=len(audio), sync_dist=True)
        return loss

    def configure_optimizers(self):
        """Return an AdamW optimizer over the trainable parameters only."""
        optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, self.parameters()), lr=0.000013, betas=(0.9, 0.999), eps=1e-08, weight_decay=1e-6)
        return optimizer

    def inference(self, audio_feature):
        """Generate caption token ids for already-extracted HuBERT input values.

        Args:
            audio_feature: Tensor fed directly to the HuBERT model.

        Returns:
            The token-id tensor returned by ``llama_model.generate``; decode
            it with the LLaMA tokenizer to obtain text.
        """
        with torch.no_grad():
            audio_feature = self.hubert_model(audio_feature).last_hidden_state
            audio_feature = self.audio_project(audio_feature)
            # Qformer
            audio_query_tokens = self.audio_query_tokens.expand(audio_feature.shape[0], -1, -1)
            frame_atts = torch.ones(audio_feature.size()[:-1], dtype=torch.long).to(audio_feature.device)
            audio_query_output = self.audio_Qformer.bert(
                query_embeds=audio_query_tokens,  # [32,768]
                encoder_hidden_states=audio_feature,
                encoder_attention_mask=frame_atts,
                return_dict=True,
            )
            audio_hidden = audio_query_output.last_hidden_state
            audio_input = self.audio_llama_project(audio_hidden)
            batchsize = audio_input.shape[0]
            # in inference, we use the same prompt for all audio
            prompt = "请用一句中文简述音频里说话者的情感表现:"
            prompts_id = self.llama_tokenizer(prompt, return_tensors='pt').input_ids.to(self.device)
            prompts_id = prompts_id.expand(batchsize, -1)
            prompts_embeds = self.llama_model.model.embed_tokens(prompts_id)
            bos = torch.ones([batchsize, 1], dtype=torch.int64).to(self.device) * self.llama_tokenizer.bos_token_id
            bos_embeds = self.llama_model.model.embed_tokens(bos.to(self.device))
            embeds = torch.cat([bos_embeds, audio_input, prompts_embeds], dim=1)
            embeds = embeds.half()
            outputs = self.llama_model.generate(
                inputs_embeds=embeds,
                max_new_tokens=50,
                min_new_tokens=3,
                do_sample=True,
                top_k=10,
                top_p=0.95,
                num_beams=5,
                repetition_penalty=10.0,
                pad_token_id=self.llama_tokenizer.pad_token_id,
                eos_token_id=self.llama_tokenizer.eos_token_id,
                early_stopping=True,
                num_return_sequences=1,
                no_repeat_ngram_size=2,
            )
        return outputs

    def post_processing(self, sentences, device):
        """Drop the three sentences with the lowest average similarity to all sentences."""
        similarities = np.zeros((len(sentences), len(sentences)))
        simi_cal = SimiCal(device=device)
        for i in range(len(sentences)):
            for j in range(len(sentences)):
                similarities[i, j] = simi_cal(sentences[i], sentences[j])
        avg_similarities = np.mean(similarities, axis=1)
        least_related_indices = avg_similarities.argsort()[:3]
        remaining_sentences = [sentences[i] for i in range(len(sentences)) if i not in least_related_indices]
        return remaining_sentences

    def test_step(self, batch, batch_idx):
        """Generate captions for one batch and append them to ``result/result_1.txt``.

        Writes the first file path and reference caption of the batch,
        followed by one ``result`` / ``resultN`` line per decoded sequence.
        """
        audio, _, describtion, fpath = batch
        # inference() returns a single tensor of token ids; decode it to text.
        output_tokens = self.inference(audio)
        results = self.llama_tokenizer.batch_decode(output_tokens, skip_special_tokens=True)
        path = os.path.dirname(os.path.abspath(__file__))
        test_file = os.path.join(path, "result/result_1.txt")
        os.makedirs(os.path.dirname(test_file), exist_ok=True)
        with open(test_file, "a", encoding="utf-8") as f:
            f.write("file: " + fpath[0] + "\n")
            f.write("origin: " + describtion[0] + "\n")
            for rank, text in enumerate(results, start=1):
                # Keep the historical labels: "result", "result2", "result3", ...
                label = "result" if rank == 1 else f"result{rank}"
                f.write(f"{label}: {text}\n")
            f.write("\n")
def count_parameters(model):
    """Return the total element count of ``model``'s parameters that have
    ``requires_grad`` set."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total
if __name__ == "__main__":
    # Script entry point: build the model and print how many parameter
    # elements have requires_grad set.
    print(count_parameters(MotionAudio()))
================================================
FILE: AutomaticPipeline/outputs/labels_LJspeech_0.json
================================================
{
"LJSpeech_LJ010-0227": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:fast\temotion:sad\ttranscription: He went from Newgate first to Bethlehem, from which he was removed to Broadmoor."
},
"LJSpeech_LJ050-0106": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:fast\temotion:neutral\ttranscription: to devise a practical system which has any reasonable possibility of revealing"
},
"LJSpeech_LJ043-0140": {
"labels": "age:Elderly\tgender:male\tpitch:high\tvolume:normal\tspeed:fast\temotion:neutral\ttranscription: He also studied Dallas bus schedules to prepare for his later use of"
},
"LJSpeech_LJ018-0032": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:fast\temotion:sad\ttranscription: There was no mystery about his departure. He had gone to Canada by the Victoria"
},
"LJSpeech_LJ012-0117": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:fast\temotion:neutral\ttranscription: By and by the occupant of the room noticed something glittering in the center of the fire."
},
"LJSpeech_LJ038-0057": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:fast\temotion:neutral\ttranscription: Deputy Sheriff Walther's brought a shotgun into the theater, but laid it on some"
},
"LJSpeech_LJ017-0158": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:fast\temotion:happy\ttranscription: She had a little fortune of her own, some 1,700 pounds"
},
"LJSpeech_LJ026-0094": {
"labels": "age:Middle-aged\tgender:male\tpitch:low\tvolume:normal\tspeed:fast\temotion:neutral\ttranscription: This however is probably not a source of vital energy, but only contributes to the"
},
"LJSpeech_LJ045-0152": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:normal\temotion:happy\ttranscription: The events of that evening can best be appreciated through Marina Oswald's testimony."
},
"LJSpeech_LJ019-0264": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:normal\temotion:neutral\ttranscription: and it was decidedly of opinion that in all short sentences the hard labor"
},
"LJSpeech_LJ040-0047": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:fast\temotion:neutral\ttranscription: That associate did not think that Oswald was a communist."
},
"LJSpeech_LJ012-0015": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:normal\temotion:neutral\ttranscription: Weedon and LaCassar to 12 and 6 months respectively in cold baths."
},
"LJSpeech_LJ019-0097": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:normal\temotion:sad\ttranscription: The want of uniformity in prison discipline became air long and acknowledged evil."
},
"LJSpeech_LJ039-0093": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:normal\temotion:neutral\ttranscription: at a distance of 265.3 feet was, quote,"
},
"LJSpeech_LJ018-0028": {
"labels": "age:Elderly\tgender:female\tpitch:high\tvolume:normal\tspeed:normal\temotion:sad\ttranscription: who had been a lodger of his. Muehler had given the cabman's little daughter"
},
"LJSpeech_LJ028-0287": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:normal\temotion:angry\ttranscription: but found none by which he could hope to prevail unless he maimed himself."
},
"LJSpeech_LJ024-0117": {
"labels": "age:Elderly\tgender:female\tpitch:low\tvolume:normal\tspeed:normal\temotion:neutral\ttranscription: You will find that many of those who pretend to support you will sabotage your plans."
},
"LJSpeech_LJ029-0172": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:normal\temotion:neutral\ttranscription: The two Dallas newspapers provided their readers with a steady stream of information and"
},
"LJSpeech_LJ024-0044": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:normal\temotion:angry\ttranscription: that I will appoint justices who will act as justices and not as"
},
"LJSpeech_LJ045-0246": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:low\tspeed:normal\temotion:sad\ttranscription: He sought for himself a place in history, a role as the great"
},
"LJSpeech_LJ042-0191": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:slow\temotion:neutral\ttranscription: While, quote, resourcefulness and patient working towards the aforesaid goal"
},
"LJSpeech_LJ050-0099": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:slow\temotion:neutral\ttranscription: It is apparent that a good deal of further consideration and experimentation"
},
"LJSpeech_LJ017-0101": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:low\tspeed:slow\temotion:neutral\ttranscription: That day, Palmer had bought more strychnia and had called in a fresh doctor."
},
"LJSpeech_LJ016-0388": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:low\tspeed:fast\temotion:happy\ttranscription: A few go further and are almost gluttonous."
},
"LJSpeech_LJ007-0129": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:low\tspeed:normal\temotion:angry\ttranscription: the sane and the insane, the young and the old, the trivial offender and the"
},
"LJSpeech_LJ014-0195": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:normal\temotion:neutral\ttranscription: Two of them supported Cope, who was still alive, although insensible, and Marley"
},
"LJSpeech_LJ027-0056": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:fast\temotion:happy\ttranscription: The relation of skeleton and muscle in arthropods is exactly the reverse."
},
"LJSpeech_LJ030-0227": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:normal\temotion:neutral\ttranscription: Special Agent George W. Hickey Jr. in the rear seat of the Presidential"
},
"LJSpeech_LJ047-0210": {
"labels": "age:Middle-aged\tgender:male\tpitch:low\tvolume:low\tspeed:fast\temotion:neutral\ttranscription: I think all of those, if we had them all together..."
},
"LJSpeech_LJ048-0228": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:slow\temotion:neutral\ttranscription: and others who were present say that no agent was inebriated or acted improperly."
},
"LJSpeech_LJ009-0104": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:slow\temotion:neutral\ttranscription: The women set up a yell, which is mixed with a rustling noise, occasioned by the"
},
"LJSpeech_LJ017-0085": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:slow\temotion:neutral\ttranscription: Palmer's plan was to administer poison in quantities insufficient to"
},
"LJSpeech_LJ018-0275": {
"labels": "age:Elderly\tgender:male\tpitch:high\tvolume:normal\tspeed:slow\temotion:neutral\ttranscription: On Tarpey's defense, it was stated that the idea of the theft had been suggested"
},
"LJSpeech_LJ027-0064": {
"labels": "age:Middle-aged\tgender:male\tpitch:low\tvolume:low\tspeed:fast\temotion:neutral\ttranscription: All vertebrates and none other have two cavities."
},
"LJSpeech_LJ036-0084": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:slow\temotion:neutral\ttranscription: Craig also claimed that when Fritz pointed out to Oswald that Craig had identified"
},
"LJSpeech_LJ040-0211": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:slow\temotion:neutral\ttranscription: This would make him approximately ten, well almost eleven years old."
},
"LJSpeech_LJ032-0241": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:slow\temotion:happy\ttranscription: Ten days prior to the Walker attempt, Oswald had undoubtedly received the rifle."
},
"LJSpeech_LJ002-0158": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:slow\temotion:neutral\ttranscription: Other cases are recorded elsewhere, as at the Gilsburg Street Comptor, where"
},
"LJSpeech_LJ050-0077": {
"labels": "age:Elderly\tgender:male\tpitch:high\tvolume:low\tspeed:fast\temotion:neutral\ttranscription: and in taking preventive steps."
},
"LJSpeech_LJ032-0236": {
"labels": "age:Elderly\tgender:male\tpitch:high\tvolume:high\tspeed:slow\temotion:neutral\ttranscription: By checking the actual mailing dates of these issues and the time it usually takes"
},
"LJSpeech_LJ026-0102": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:slow\temotion:happy\ttranscription: but root pressure due to osmosis, capillary action and evaporation."
},
"LJSpeech_LJ025-0057": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:slow\temotion:neutral\ttranscription: such as the charai, are in constant and regular motion, was made out"
},
"LJSpeech_LJ018-0270": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:slow\temotion:neutral\ttranscription: The assistant called with the jewels on approbation at a house specially hired for"
},
"LJSpeech_LJ013-0021": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:low\tspeed:slow\temotion:sad\ttranscription: The larboard pump was suffered to remain choked up and the longboat was"
},
"LJSpeech_LJ042-0050": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:normal\temotion:sad\ttranscription: I have been a pro-communist for years and yet I have never met a communist."
},
"LJSpeech_LJ006-0075": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:normal\temotion:happy\ttranscription: He saw certain rooms fill up, and yet took no steps to open others that were locked"
},
"LJSpeech_LJ048-0187": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:normal\temotion:neutral\ttranscription: In addition, Secret Service agents riding in the motorcade were trained to"
},
"LJSpeech_LJ022-0186": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:normal\temotion:happy\ttranscription: Twenty years of experience with this system have justified the efforts made to"
},
"LJSpeech_LJ013-0078": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:fast\temotion:neutral\ttranscription: Barbara was subsequently pardoned, but was not replaced on the roles as an attorney."
},
"LJSpeech_LJ006-0015": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:fast\temotion:neutral\ttranscription: one which required discretion, judgment and knowledge of law, with sufficient"
},
"LJSpeech_LJ048-0061": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:fast\temotion:neutral\ttranscription: Under proper procedures, knowledge of the pending Presidential visit might have prompted"
},
"LJSpeech_LJ015-0190": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:fast\temotion:sad\ttranscription: that of dishonest rogues who assume piety and philanthropy as a cloak for the world."
},
"LJSpeech_LJ003-0115": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:normal\temotion:neutral\ttranscription: The judge sat in proper form. He was punctiliously styled, my"
},
"LJSpeech_LJ045-0244": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:normal\temotion:angry\ttranscription: Long before the assassination, he expressed his hatred for American society."
},
"LJSpeech_LJ004-0051": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:normal\temotion:happy\ttranscription: When in Belgium he had examined with great satisfaction the admirable manner"
},
"LJSpeech_LJ011-0265": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:normal\temotion:sad\ttranscription: This Mr. Canning had left his widow a life interest in two thousand pounds."
},
"LJSpeech_LJ028-0087": {
"labels": "age:Middle-aged\tgender:male\tpitch:low\tvolume:low\tspeed:fast\temotion:neutral\ttranscription: Such was the appearance of the builder of the walls of Babylon."
},
"LJSpeech_LJ028-0178": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:slow\temotion:happy\ttranscription: The walls of Babylon were so long and wide and high that all who"
},
"LJSpeech_LJ011-0113": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:slow\temotion:sad\ttranscription: He met his death with unshaken firmness, only in treating that a certain blue"
},
"LJSpeech_LJ039-0162": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:slow\temotion:sad\ttranscription: In an effort to test the rifle under conditions which simulated those which prevailed during"
},
"LJSpeech_LJ045-0066": {
"labels": "age:Middle-aged\tgender:male\tpitch:normal\tvolume:low\tspeed:fast\temotion:sad\ttranscription: Oswald struck his wife on occasion."
},
"LJSpeech_LJ028-0277": {
"labels": "age:Middle-aged\tgender:male\tpitch:normal\tvolume:low\tspeed:fast\temotion:surprised\ttranscription: One of his Sumter mules gave birth to a foal."
},
"LJSpeech_LJ048-0197": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:slow\temotion:neutral\ttranscription: I then told the officers that their primary duty was traffic and crowd control and that"
},
"LJSpeech_LJ019-0170": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:slow\temotion:neutral\ttranscription: More officers were appointed, as the time of so many of those already on the staff"
},
"LJSpeech_LJ041-0092": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:slow\temotion:neutral\ttranscription: and quote, individual that you would brainwash and quite easy, but I think"
},
"LJSpeech_LJ039-0027": {
"labels": "age:Middle-aged\tgender:male\tpitch:low\tvolume:low\tspeed:fast\temotion:happy\ttranscription: and switch with Azure."
},
"LJSpeech_LJ030-0077": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:slow\temotion:neutral\ttranscription: Each agent carried a .38 caliber pistol and a shotgun and"
},
"LJSpeech_LJ038-0112": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:slow\temotion:neutral\ttranscription: As previously indicated, Marina Oswald testified that she took the"
},
"LJSpeech_LJ017-0119": {
"labels": "age:Elderly\tgender:male\tpitch:high\tvolume:low\tspeed:fast\temotion:happy\ttranscription: All the circumstances were so suspicious that he could not escape the criminal charge."
},
"LJSpeech_LJ007-0242": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:fast\temotion:neutral\ttranscription: No complete and permanent improvement was indeed possible while Newgate remained unchanged."
},
"LJSpeech_LJ020-0046": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:low\tspeed:fast\temotion:happy\ttranscription: Close the dough over it, dust your hands and kneading board with flour,"
},
"LJSpeech_LJ018-0001": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:fast\temotion:neutral\ttranscription: The Chronicles of Newgate, Volume 2, by Arthur Griffiths. Section 21."
},
"LJSpeech_LJ026-0098": {
"labels": "age:Elderly\tgender:male\tpitch:high\tvolume:normal\tspeed:slow\temotion:neutral\ttranscription: The circulatory system distributes these foods in animals,"
},
"LJSpeech_LJ021-0152": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:slow\temotion:neutral\ttranscription: and to experiment for a reasonable time with measures suitable to"
},
"LJSpeech_LJ047-0223": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:low\tspeed:slow\temotion:neutral\ttranscription: I don't recall the exact date, it was about a week prior.\""
},
"LJSpeech_LJ007-0040": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:low\tspeed:fast\temotion:happy\ttranscription: In a second room were fourteen more who had every hope of a reprieve."
},
"LJSpeech_LJ050-0103": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:normal\temotion:sad\ttranscription: were all men who acted alone in their criminal acts against our leaders."
},
"LJSpeech_LJ037-0016": {
"labels": "age:Elderly\tgender:male\tpitch:high\tvolume:normal\tspeed:normal\temotion:neutral\ttranscription: Scoggins hurriedly left his seat and hid behind the cab, as the man came back to"
},
"LJSpeech_LJ028-0336": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:fast\temotion:sad\ttranscription: Darius now, still keeping to the plan agreed upon."
},
"LJSpeech_LJ004-0233": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:normal\temotion:neutral\ttranscription: Under the new rule, visitors were not allowed to pass into the interior of the prison,"
},
"LJSpeech_LJ024-0127": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:slow\temotion:angry\ttranscription: You who know me can have no fear that I would tolerate the destruction"
},
"LJSpeech_LJ004-0189": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:slow\temotion:happy\ttranscription: The great principles of classification, cleanliness and employment were closely observed."
},
"LJSpeech_LJ005-0045": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:fast\temotion:neutral\ttranscription: Nor did it confine itself to mere verbal recommendations."
},
"LJSpeech_LJ005-0158": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:fast\temotion:happy\ttranscription: Bedding and clothing was still denied, but only in a few jails."
},
"LJSpeech_LJ011-0018": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:normal\temotion:neutral\ttranscription: The crime, long carried on without detection, was first discovered in 1820."
},
"LJSpeech_LJ017-0022": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:low\tspeed:normal\temotion:neutral\ttranscription: was that of Eliza Fenning, who was convicted of an attempt to poison a whole family."
},
"LJSpeech_LJ030-0136": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:low\tspeed:normal\temotion:sad\ttranscription: The President replied, that is very obvious."
},
"LJSpeech_LJ039-0078": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:normal\temotion:neutral\ttranscription: The effect of a four-power telescopic sight on the difficulty of these shots"
},
"LJSpeech_LJ048-0254": {
"labels": "age:Elderly\tgender:male\tpitch:high\tvolume:normal\tspeed:normal\temotion:neutral\ttranscription: advised in the course of the Secret Service investigation of these events that each agent"
},
"LJSpeech_LJ049-0083": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:normal\temotion:angry\ttranscription: It has long been a federal crime to conspire to injure any federal officer on account of"
},
"LJSpeech_LJ006-0043": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:normal\temotion:neutral\ttranscription: The disgraceful overcrowding had been partially ended, but the same evils of"
},
"LJSpeech_LJ025-0109": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:normal\temotion:neutral\ttranscription: has been as completely invalidated as the third and second."
},
"LJSpeech_LJ044-0163": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:normal\temotion:neutral\ttranscription: Marina Oswald testified that her husband engaged in fair play for Cuba."
},
"LJSpeech_LJ003-0037": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:normal\temotion:neutral\ttranscription: A site was purchased between Red Lion and White Cross streets and a new"
},
"LJSpeech_LJ013-0117": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:normal\temotion:happy\ttranscription: and with another carry off the plate-chest in broad daylight and as a matter of business."
},
"LJSpeech_LJ027-0096": {
"labels": "age:Middle-aged\tgender:male\tpitch:low\tvolume:low\tspeed:fast\temotion:neutral\ttranscription: as indeed she must be according to the derivation theory."
},
"LJSpeech_LJ021-0035": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:slow\temotion:happy\ttranscription: saved debtors and creditors alike in many other fields of enterprise."
},
"LJSpeech_LJ049-0159": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:slow\temotion:neutral\ttranscription: the Secret Service, then the only federal investigative agency, assumed"
},
"LJSpeech_LJ039-0100": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:slow\temotion:happy\ttranscription: During the first week of an intensive eight-week training period, he received and"
},
"LJSpeech_LJ019-0302": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:low\tspeed:slow\temotion:neutral\ttranscription: In 1862, there were in all 193 jails."
},
"LJSpeech_LJ030-0229": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:fast\temotion:neutral\ttranscription: At this point, the cars were speeding through the underpass and had left the scene of the"
},
"LJSpeech_LJ005-0151": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:fast\temotion:neutral\ttranscription: In others, women were very properly exempted from it, and also from all severe"
},
"LJSpeech_LJ015-0115": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:fast\temotion:sad\ttranscription: Little more remains to be said about Robson. He appears to have accepted his position."
},
"LJSpeech_LJ010-0279": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:fast\temotion:neutral\ttranscription: I shall mention briefly one more case, in which, however, there was no murderous"
},
"LJSpeech_LJ044-0119": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:fast\temotion:neutral\ttranscription: into which Oswald, in his own words, had quote, thrown himself. He sought it"
},
"LJSpeech_LJ049-0115": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:fast\temotion:neutral\ttranscription: of the person who is actually in the exercise of the executive power or"
},
"LJSpeech_LJ004-0139": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:fast\temotion:happy\ttranscription: In the morning, the stench and heat were so oppressive that he and everyone else"
},
"LJSpeech_LJ007-0223": {
"labels": "age:Elderly\tgender:male\tpitch:high\tvolume:normal\tspeed:fast\temotion:neutral\ttranscription: The prison officials appear to be on the side of the inspectors, to the great dissatisfaction"
},
"LJSpeech_LJ032-0070": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:normal\temotion:neutral\ttranscription: Ordinarily, Inspector Holmes testified, identification is not requested because"
},
"LJSpeech_LJ015-0294": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:normal\temotion:neutral\ttranscription: He forgot to add that it was to be placed to Ralph's credit, and when he called"
},
"LJSpeech_LJ014-0083": {
"labels": "age:Elderly\tgender:male\tpitch:high\tvolume:normal\tspeed:normal\temotion:neutral\ttranscription: which, having possessed herself of the murdered man's keys, she rifled from"
},
"LJSpeech_LJ040-0195": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:normal\temotion:neutral\ttranscription: It appears that he did not want to do any of the things which the authorities suggest"
},
"LJSpeech_LJ030-0028": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:normal\temotion:neutral\ttranscription: and the Vice President and Mrs. Johnson were in the receiving line to greet President"
},
"LJSpeech_LJ009-0145": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:normal\temotion:neutral\ttranscription: and he seemed to suffer great inward agitation when the ordinary, particularly"
},
"LJSpeech_LJ031-0117": {
"labels": "age:Middle-aged\tgender:male\tpitch:low\tvolume:low\tspeed:fast\temotion:neutral\ttranscription: assisted by doctors William Osborne and John Parker."
},
"LJSpeech_LJ004-0115": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:normal\temotion:happy\ttranscription: Infirmaries separating the sexes were also to be provided. A chapel too."
},
"LJSpeech_LJ039-0200": {
"labels": "age:Middle-aged\tgender:male\tpitch:low\tvolume:low\tspeed:fast\temotion:neutral\ttranscription: And one of these agents, Robert A. Frazier,"
},
"LJSpeech_LJ028-0481": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:low\tspeed:slow\temotion:happy\ttranscription: Nabo-Polisar, the father, my begetter, built Imgur-Bel."
},
"LJSpeech_LJ003-0245": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:slow\temotion:disgusted\ttranscription: preparatory to their appearance in the Old Bailey. Irons were seldom removed."
},
"LJSpeech_LJ017-0261": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:slow\temotion:neutral\ttranscription: Seven were found guilty of murder on the high seas and one, Carlos, a criminal."
},
"LJSpeech_LJ007-0137": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:fast\temotion:neutral\ttranscription: More attention to ventilation, which was altogether neglected and inadequate, would"
},
"LJSpeech_LJ041-0017": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:fast\temotion:neutral\ttranscription: Wobel also recalled that Oswald once outlined a plan to cut the glass in the"
},
"LJSpeech_LJ043-0061": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:fast\temotion:neutral\ttranscription: At the time of his defection, when he evidenced no interest in his father and"
},
"LJSpeech_LJ033-0097": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:fast\temotion:neutral\ttranscription: Frazier parked the car in the company parking lot about two blocks north of the depository"
},
"LJSpeech_LJ019-0234": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:slow\temotion:sad\ttranscription: The old buildings were entirely disused, and the whole of the inmates of Newgate were"
},
"LJSpeech_LJ016-0296": {
"labels": "age:Middle-aged\tgender:male\tpitch:high\tvolume:low\tspeed:fast\temotion:neutral\ttranscription: The actual execution made some impression."
},
"LJSpeech_LJ023-0072": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:slow\temotion:neutral\ttranscription: Congress passed a statute which, in 1803, the courts"
},
"LJSpeech_LJ050-0100": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:slow\temotion:neutral\ttranscription: The Commission recognizes that no set of meaningful criteria will yield"
},
"LJSpeech_LJ028-0113": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:fast\temotion:neutral\ttranscription: With mortar and bricks, he built two moat walls about the city."
},
"LJSpeech_LJ013-0121": {
"labels": "age:Elderly\tgender:male\tpitch:high\tvolume:normal\tspeed:normal\temotion:sad\ttranscription: Howe and his accomplice were arrested. The former was found guilty and sentenced"
},
"LJSpeech_LJ014-0273": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:normal\temotion:neutral\ttranscription: The detection of these frauds came while he was still prominently before the world as"
},
"LJSpeech_LJ027-0136": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:normal\temotion:neutral\ttranscription: Illustrations quoted from the works of Romains and Locante will make this principle"
},
"LJSpeech_LJ003-0067": {
"labels": "age:Middle-aged\tgender:male\tpitch:normal\tvolume:low\tspeed:fast\temotion:neutral\ttranscription: in the then existing state of the law,"
},
"LJSpeech_LJ039-0220": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:normal\temotion:neutral\ttranscription: and that one would not have to be an expert marksman to have accomplished the assassination."
},
"LJSpeech_LJ018-0029": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:normal\temotion:neutral\ttranscription: A photograph of Mueller shown the jeweler was identified as the likeness of the jewel."
},
"LJSpeech_LJ001-0080": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:normal\temotion:neutral\ttranscription: He seems to have taken the letter of the Elseviers of the 17th century for his mom."
},
"LJSpeech_LJ028-0454": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:fast\temotion:neutral\ttranscription: But both sections originally reached the river."
},
"LJSpeech_LJ037-0244": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:slow\temotion:neutral\ttranscription: Westbrook identified Commission Exhibit 162 as the light-colored"
},
"LJSpeech_LJ008-0174": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:slow\temotion:sad\ttranscription: One cartload of spectators having broken down, some of its occupants fell"
},
"LJSpeech_LJ009-0302": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:slow\temotion:sad\ttranscription: Another man was hired, himself a convict, whose fees for self and wife were"
},
"LJSpeech_LJ002-0182": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:fast\temotion:neutral\ttranscription: the fleet and the Marshall Sea prisons especially devoted to them."
},
"LJSpeech_LJ048-0023": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:slow\temotion:neutral\ttranscription: and he had told us during one of the interviews that he would probably take his wife back to"
},
"LJSpeech_LJ019-0257": {
"labels": "age:Elderly\tgender:male\tpitch:high\tvolume:low\tspeed:slow\temotion:sad\ttranscription: Here, the tread wheel was in use. There, cellular cranks."
},
"LJSpeech_LJ040-0046": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:low\tspeed:slow\temotion:neutral\ttranscription: which one associate described as, quote, irrevocable, end quote."
},
"LJSpeech_LJ006-0295": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:slow\temotion:neutral\ttranscription: The governor was also personally responsible for gross contravention of this rule of"
},
"LJSpeech_LJ028-0252": {
"labels": "age:Middle-aged\tgender:male\tpitch:normal\tvolume:low\tspeed:fast\temotion:neutral\ttranscription: Only the king's son, Belshazzar, was killed."
},
"LJSpeech_LJ035-0002": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:low\tspeed:slow\temotion:neutral\ttranscription: Chapter 4 The Assassin Part 4 Oswald's Actions"
},
"LJSpeech_LJ009-0189": {
"labels": "age:Elderly\tgender:male\tpitch:high\tvolume:high\tspeed:slow\temotion:neutral\ttranscription: Persons were still living in 1855 who had witnessed dissections at Hicks Hall."
},
"LJSpeech_LJ011-0032": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:normal\temotion:neutral\ttranscription: declared that they had hitherto formed a high opinion of his honor, integrity, and"
},
"LJSpeech_LJ042-0155": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:fast\temotion:neutral\ttranscription: It appears to be the work of a fairly well-organized person."
},
"LJSpeech_LJ004-0039": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:low\tspeed:fast\temotion:sad\ttranscription: They were hopeless of any general reform by the action of the executive alone."
},
"LJSpeech_LJ025-0157": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:normal\temotion:neutral\ttranscription: under these circumstances, unnatural as they are, with proper management"
},
"LJSpeech_LJ030-0121": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:normal\temotion:happy\ttranscription: Several times, Special Agent John D. Reddy came forward from the right front"
},
"LJSpeech_LJ010-0135": {
"labels": "age:Elderly\tgender:male\tpitch:high\tvolume:low\tspeed:fast\temotion:happy\ttranscription: yelled out three cheers to the populace whom he faced."
},
"LJSpeech_LJ050-0118": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:normal\temotion:neutral\ttranscription: Since these agencies are already obliged constantly to evaluate the activities"
},
"LJSpeech_LJ040-0175": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:normal\temotion:neutral\ttranscription: through which they could not reach him, but that he preferred the veil to remain intact."
},
"LJSpeech_LJ013-0042": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:normal\temotion:neutral\ttranscription: the foundations of which had been laid by buying old ships on purpose to cast the more"
},
"LJSpeech_LJ014-0076": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:normal\temotion:happy\ttranscription: He was seen afterwards smoking and talking with his hosts in their back parlor."
},
"LJSpeech_LJ008-0097": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:normal\temotion:sad\ttranscription: No sooner was the job finished than half a dozen competitors appeared."
},
"LJSpeech_LJ043-0166": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:normal\temotion:neutral\ttranscription: Possibly he might have wanted to be caught and wanted his involvement made clear"
},
"LJSpeech_LJ045-0127": {
"labels": "age:Elderly\tgender:male\tpitch:high\tvolume:high\tspeed:normal\temotion:neutral\ttranscription: He absolved the Soviet embassy in Mexico City of any blame for his difficulties."
},
"LJSpeech_LJ027-0117": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:normal\temotion:happy\ttranscription: Now, here again the former theory appears to be triumphant over the latter."
},
"LJSpeech_LJ035-0076": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:normal\temotion:happy\ttranscription: Special Agent John Howlett of the Secret Service carried a rifle from the south"
},
"LJSpeech_LJ005-0101": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:fast\temotion:sad\ttranscription: Quince it deduced the practice and condition of every prison that replied."
},
"LJSpeech_LJ028-0302": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:slow\temotion:happy\ttranscription: I will desert to the enemy as I am, and when I get into their city,"
},
"LJSpeech_LJ036-0196": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:slow\temotion:neutral\ttranscription: Tippett patrolled District 78 in the Oak Cliff area of Dixie."
},
"LJSpeech_LJ028-0191": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:fast\temotion:happy\ttranscription: The old enemies of Babylon rejoiced."
},
"LJSpeech_LJ038-0214": {
"labels": "age:Elderly\tgender:male\tpitch:high\tvolume:high\tspeed:slow\temotion:neutral\ttranscription: and the other paragraphs instructed her on the disposal of Oswald's personal effects"
},
"LJSpeech_LJ003-0304": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:normal\temotion:neutral\ttranscription: with the penalty of forfeiting the day's allowance of food, an increase of which the committee"
},
"LJSpeech_LJ019-0009": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:normal\temotion:neutral\ttranscription: or to insist upon the construction of prisons on the most approved plan."
},
"LJSpeech_LJ022-0192": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:fast\temotion:neutral\ttranscription: They contemplate the enrichment of our national life."
},
"LJSpeech_LJ007-0026": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:normal\temotion:sad\ttranscription: And with all this, the most dreadful oaths, the worst language, too bad to be repeated."
},
"LJSpeech_LJ035-0158": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:normal\temotion:neutral\ttranscription: A blue jacket, later identified by Marina Oswald as her husband's, was"
},
"LJSpeech_LJ033-0014": {
"labels": "age:Middle-aged\tgender:male\tpitch:low\tvolume:normal\tspeed:normal\temotion:sad\ttranscription: Lee Harvey Oswald lived in a rooming house in Dallas, while his wife and children lived"
},
"LJSpeech_LJ018-0369": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:normal\temotion:neutral\ttranscription: the mysterious Bravo case, that of Dr. Lamson, and that of"
},
"LJSpeech_LJ017-0028": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:normal\temotion:neutral\ttranscription: that she had had a quarrel with her mistress, and that the latter, with all others,"
},
"LJSpeech_LJ004-0109": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:fast\temotion:neutral\ttranscription: according to their categories or crimes."
},
"LJSpeech_LJ011-0239": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:normal\temotion:neutral\ttranscription: Where robbery with violence was intended, the perpetrators had now to adopt various"
},
"LJSpeech_LJ049-0034": {
"labels": "age:Elderly\tgender:male\tpitch:high\tvolume:normal\tspeed:normal\temotion:surprised\ttranscription: could have reached the President in time to protect him from the second and fatal shot"
},
"LJSpeech_LJ016-0389": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:normal\temotion:neutral\ttranscription: Giovanni Lanni, the Italian boy who murdered a French woman in the hay market."
},
"LJSpeech_LJ037-0018": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:slow\temotion:sad\ttranscription: Skaggins saw him and heard him mutter either, Poor damn cop or..."
},
"LJSpeech_LJ041-0163": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:slow\temotion:neutral\ttranscription: Out of a combination of Oswald's known Marxist sympathies and George Orwell's"
},
"LJSpeech_LJ017-0178": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:slow\temotion:surprised\ttranscription: It appeared that several persons with whom she was intimate had succumbed suddenly."
},
"LJSpeech_LJ048-0006": {
"labels": "age:Middle-aged\tgender:male\tpitch:low\tvolume:low\tspeed:fast\temotion:neutral\ttranscription: or to the Vice President.\""
},
"LJSpeech_LJ002-0184": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:slow\temotion:neutral\ttranscription: the latter two being also a prison for felons and vagrants arrested within certain"
},
"LJSpeech_LJ012-0143": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:slow\temotion:neutral\ttranscription: Money Moses had received the stolen gold dust from Moss's father-in-law, Davis."
},
"LJSpeech_LJ023-0012": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:slow\temotion:neutral\ttranscription: But when almost two years later it came before the Supreme"
},
"LJSpeech_LJ036-0068": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:low\tspeed:fast\temotion:sad\ttranscription: Both buses stopped within one block of the depository building."
},
"LJSpeech_LJ046-0158": {
"labels": "age:Elderly\tgender:male\tpitch:high\tvolume:high\tspeed:fast\temotion:neutral\ttranscription: At the time of the assassination, the active PRS general files contained"
},
"LJSpeech_LJ011-0287": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:fast\temotion:neutral\ttranscription: but at length managed to wriggle out of the chain which confined his body and"
},
"LJSpeech_LJ016-0399": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:fast\temotion:neutral\ttranscription: Wainwright was allowed a cigar the night before execution, which he smoked in the prison yard."
},
"LJSpeech_LJ006-0012": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:fast\temotion:neutral\ttranscription: but he would not arm them with any authority lest their cooperation might be offensive."
},
"LJSpeech_LJ005-0102": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:fast\temotion:neutral\ttranscription: Upon these and the private visitations made by various members, the Society"
},
"LJSpeech_LJ001-0169": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:fast\temotion:neutral\ttranscription: The paper used for printing the small, highly ornamented French service books."
},
"LJSpeech_LJ011-0271": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:fast\temotion:neutral\ttranscription: Mr. G went in the coach sent for him and alighted at 27 York Street,"
},
"LJSpeech_LJ014-0057": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:fast\temotion:neutral\ttranscription: London did not escape the contagion and, prominent among the detestable crimes of"
},
"LJSpeech_LJ008-0191": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:slow\temotion:disgusted\ttranscription: At Courvoisier's execution in 1840, it was the same, or worse."
},
"LJSpeech_LJ007-0120": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:fast\temotion:sad\ttranscription: some to the infirmary, many more to the governor's house."
},
"LJSpeech_LJ012-0268": {
"labels": "age:Elderly\tgender:male\tpitch:high\tvolume:low\tspeed:normal\temotion:neutral\ttranscription: Suspicion grew almost to certainty as the evidence was unfolded."
},
"LJSpeech_LJ015-0298": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:slow\temotion:happy\ttranscription: But while Hardwick was in communication with Sawward, the bank was in communication with"
},
"LJSpeech_LJ030-0213": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:slow\temotion:neutral\ttranscription: Hill heard a second shot, approximately five seconds after the first, which removed"
},
"LJSpeech_LJ009-0121": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:normal\temotion:happy\ttranscription: for all thy goodness and loving kindness to us and to all men."
},
"LJSpeech_LJ037-0084": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:slow\temotion:neutral\ttranscription: Barbara Jeanette Davis testified that no one had shown her a picture of Oswald before."
},
"LJSpeech_LJ009-0219": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:fast\temotion:happy\ttranscription: A clause was inserted to the effect that"
},
"LJSpeech_LJ040-0143": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:normal\temotion:neutral\ttranscription: Contrary to reports that appeared after the assassination, the psychiatric examiner"
},
"LJSpeech_LJ006-0048": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:normal\temotion:neutral\ttranscription: To these were still added an average of about 50, expecting the last penalty."
},
"LJSpeech_LJ033-0006": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:fast\temotion:neutral\ttranscription: In this connection, the Commission considered one"
},
"LJSpeech_LJ044-0207": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:normal\temotion:neutral\ttranscription: the economic embargo against that country, and the general policy of the United States."
},
"LJSpeech_LJ032-0260": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:normal\temotion:neutral\ttranscription: He thought it contained tent poles or possibly other camping equipment, such as"
},
"LJSpeech_LJ017-0030": {
"labels": "age:Elderly\tgender:male\tpitch:high\tvolume:high\tspeed:normal\temotion:happy\ttranscription: When the spread of scientific knowledge places nefarious means at the disposal of"
},
"LJSpeech_LJ005-0084": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:normal\temotion:neutral\ttranscription: so as to prevent them from seeing, conversing, or holding any interaction."
},
"LJSpeech_LJ008-0236": {
"labels": "age:Middle-aged\tgender:male\tpitch:low\tvolume:low\tspeed:fast\temotion:neutral\ttranscription: Coiled up on the floor of the scaffold like a serpent, the hangman's rope."
},
"LJSpeech_LJ027-0020": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:slow\temotion:happy\ttranscription: The unity in life, then, is not less a fact than is life's great diversity."
},
"LJSpeech_LJ017-0238": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:slow\temotion:fearful\ttranscription: Tefer the second mate agreed, but constantly went in fear of his life."
},
"LJSpeech_LJ045-0233": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:slow\temotion:angry\ttranscription: He consistently refused to admit involvement in the assassination"
},
"LJSpeech_LJ038-0099": {
"labels": "age:Middle-aged\tgender:male\tpitch:low\tvolume:low\tspeed:fast\temotion:neutral\ttranscription: From the outset, Oswald denied owning a rifle."
},
"LJSpeech_LJ028-0231": {
"labels": "age:Elderly\tgender:male\tpitch:low\tvolume:low\tspeed:fast\temotion:neutral\ttranscription: whereupon they withdrew within their defenses."
},
"LJSpeech_LJ044-0239": {
"labels": "age:Elderly\tgender:male\tpitch:high\tvolume:high\tspeed:slow\temotion:neutral\ttranscription: and raises serious questions as to whether or not he ever expected to"
},
"LJSpeech_LJ050-0086": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:slow\temotion:neutral\ttranscription: Under these criteria, whether the case should be referred to the Secret Service depends"
},
"LJSpeech_LJ047-0021": {
"labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:slow\temotion:neutral\ttranscription: information regarding his relations with the U.S. Embassy in Moscow and background"
}
}
================================================
FILE: AutomaticPipeline/outputs/labels_LJspeech_0.scp
================================================
LJSpeech LJ010-0227 68 male 968.5380249023438 0.8162265419960022 0.022741336633663366 sad He went from Newgate first to Bethlehem, from which he was removed to Broadmoor.
LJSpeech LJ050-0106 59 male 906.479736328125 0.8606864213943481 0.022741336633663366 neutral to devise a practical system which has any reasonable possibility of revealing
LJSpeech LJ043-0140 57 male 1299.147705078125 0.8262879848480225 0.022741336633663366 neutral He also studied Dallas bus schedules to prepare for his later use of
LJSpeech LJ018-0032 66 male 1098.609619140625 0.763823390007019 0.022741336633663366 sad There was no mystery about his departure. He had gone to Canada by the Victoria
LJSpeech LJ012-0117 62 male 988.9229736328125 0.8686652183532715 0.022741336633663366 neutral By and by the occupant of the room noticed something glittering in the center of the fire.
LJSpeech LJ038-0057 65 male 1146.2008056640625 0.838049590587616 0.022741336633663366 neutral Deputy Sheriff Walther's brought a shotgun into the theater, but laid it on some
LJSpeech LJ017-0158 62 male 951.9786376953125 0.8371432423591614 0.022741336633663366 happy She had a little fortune of her own, some 1,700 pounds
LJSpeech LJ026-0094 53 male 1013.83544921875 0.8242807984352112 0.022741336633663366 neutral This however is probably not a source of vital energy, but only contributes to the
LJSpeech LJ045-0152 65 male 928.56396484375 0.8731129169464111 0.024118640350877192 happy The events of that evening can best be appreciated through Marina Oswald's testimony.
LJSpeech LJ019-0264 67 male 1011.5900268554688 0.874998152256012 0.024177631578947367 neutral and it was decidedly of opinion that in all short sentences the hard labor
LJSpeech LJ040-0047 63 male 1101.93212890625 0.5375705361366272 0.016539692982456142 neutral That associate did not think that Oswald was a communist.
LJSpeech LJ012-0015 63 male 1143.9190673828125 0.8230547904968262 0.024177631578947367 neutral Weedon and LaCassar to 12 and 6 months respectively in cold baths.
LJSpeech LJ019-0097 63 male 976.0210571289062 0.868423581123352 0.024009146341463415 sad The want of uniformity in prison discipline became air long and acknowledged evil.
LJSpeech LJ039-0093 65 male 1167.13720703125 0.7914684414863586 0.024009146341463415 neutral at a distance of 265.3 feet was, quote,
LJSpeech LJ018-0028 63 female 1036.7177734375 0.8202313780784607 0.024009146341463415 sad who had been a lodger of his. Muehler had given the cabman's little daughter
LJSpeech LJ028-0287 66 male 1013.0613403320312 0.7936339974403381 0.024009146341463415 angry but found none by which he could hope to prevail unless he maimed himself.
LJSpeech LJ024-0117 65 female 746.5477294921875 0.8057168126106262 0.024093094405594404 neutral You will find that many of those who pretend to support you will sabotage your plans.
LJSpeech LJ029-0172 64 male 1144.1251220703125 0.8575349450111389 0.024093094405594404 neutral The two Dallas newspapers provided their readers with a steady stream of information and
LJSpeech LJ024-0044 59 male 1082.681884765625 0.8042803406715393 0.024093094405594404 angry that I will appoint justices who will act as justices and not as
LJSpeech LJ045-0246 60 male 916.866943359375 0.77986079454422 0.024093094405594404 sad He sought for himself a place in history, a role as the great
LJSpeech LJ042-0191 59 male 1080.6220703125 0.7938746213912964 0.026502403846153846 neutral While, quote, resourcefulness and patient working towards the aforesaid goal
LJSpeech LJ050-0099 66 male 1119.674072265625 0.8032099604606628 0.026502403846153846 neutral It is apparent that a good deal of further consideration and experimentation
LJSpeech LJ017-0101 63 male 889.54541015625 0.7770697474479675 0.026502403846153846 neutral That day, Palmer had bought more strychnia and had called in a fresh doctor.
LJSpeech LJ016-0388 56 male 941.869384765625 0.5061151385307312 0.01628389423076923 happy A few go further and are almost gluttonous.
LJSpeech LJ007-0129 66 male 657.7396850585938 0.7841930985450745 0.022816639072847682 angry the sane and the insane, the young and the old, the trivial offender and the
LJSpeech LJ014-0195 62 male 899.5761108398438 0.8505150675773621 0.022816639072847682 neutral Two of them supported Cope, who was still alive, although insensible, and Marley
LJSpeech LJ027-0056 60 male 1063.888671875 0.8467499017715454 0.02244308774834437 happy The relation of skeleton and muscle in arthropods is exactly the reverse.
LJSpeech LJ030-0227 61 male 1070.8468017578125 0.8109928965568542 0.022816639072847682 neutral Special Agent George W. Hickey Jr. in the rear seat of the Presidential
LJSpeech LJ047-0210 53 male 734.2883911132812 0.4516647458076477 0.013350046641791045 neutral I think all of those, if we had them all together...
LJSpeech LJ048-0228 69 male 914.8754272460938 0.8221648335456848 0.025711287313432835 neutral and others who were present say that no agent was inebriated or acted improperly.
LJSpeech LJ009-0104 63 male 910.867919921875 0.8145198225975037 0.025711287313432835 neutral The women set up a yell, which is mixed with a rustling noise, occasioned by the
LJSpeech LJ017-0085 61 male 1220.1307373046875 0.800680935382843 0.025711287313432835 neutral Palmer's plan was to administer poison in quantities insufficient to
LJSpeech LJ018-0275 66 male 1315.436279296875 0.8228096961975098 0.025520833333333333 neutral On Tarpey's defense, it was stated that the idea of the theft had been suggested
LJSpeech LJ027-0064 54 male 993.9934692382812 0.5912888646125793 0.019532638888888888 neutral All vertebrates and none other have two cavities.
LJSpeech LJ036-0084 59 male 1092.259521484375 0.8476306796073914 0.025520833333333333 neutral Craig also claimed that when Fritz pointed out to Oswald that Craig had identified
LJSpeech LJ040-0211 64 male 897.1492919921875 0.7998844981193542 0.025520833333333333 neutral This would make him approximately ten, well almost eleven years old.
LJSpeech LJ032-0241 59 male 979.5408935546875 0.8054627180099487 0.02630009541984733 happy Ten days prior to the Walker attempt, Oswald had undoubtedly received the rifle.
LJSpeech LJ002-0158 61 male 1082.5684814453125 0.8261975646018982 0.02630009541984733 neutral Other cases are recorded elsewhere, as at the Gilsburg Street Comptor, where
LJSpeech LJ050-0077 58 male 1372.689453125 0.3803936243057251 0.011823711832061068 neutral and in taking preventive steps.
LJSpeech LJ032-0236 64 male 1258.6331787109375 0.8708815574645996 0.02630009541984733 neutral By checking the actual mailing dates of these issues and the time it usually takes
LJSpeech LJ026-0102 58 male 1112.703857421875 0.7878386974334717 0.02505681818181818 happy but root pressure due to osmosis, capillary action and evaporation.
LJSpeech LJ025-0057 63 male 1100.768798828125 0.8493995666503906 0.02505681818181818 neutral such as the charai, are in constant and regular motion, was made out
LJSpeech LJ018-0270 61 male 1052.5274658203125 0.8861708641052246 0.02505681818181818 neutral The assistant called with the jewels on approbation at a house specially hired for
LJSpeech LJ013-0021 65 male 896.8339233398438 0.7397869229316711 0.02505681818181818 sad The larboard pump was suffered to remain choked up and the longboat was
LJSpeech LJ042-0050 62 male 895.3709716796875 0.8398865461349487 0.023122902684563757 sad I have been a pro-communist for years and yet I have never met a communist.
LJSpeech LJ006-0075 65 male 1124.275634765625 0.8591945767402649 0.023122902684563757 happy He saw certain rooms fill up, and yet took no steps to open others that were locked
LJSpeech LJ048-0187 62 male 1194.6612548828125 0.8206942677497864 0.023122902684563757 neutral In addition, Secret Service agents riding in the motorcade were trained to
LJSpeech LJ022-0186 59 male 1200.8306884765625 0.8014236688613892 0.023122902684563757 happy Twenty years of experience with this system have justified the efforts made to
LJSpeech LJ013-0078 67 male 1060.476318359375 0.8209429979324341 0.02173698738170347 neutral Barbara was subsequently pardoned, but was not replaced on the roles as an attorney.
LJSpeech LJ006-0015 62 male 1111.1729736328125 0.8129136562347412 0.02173698738170347 neutral one which required discretion, judgment and knowledge of law, with sufficient
LJSpeech LJ048-0061 64 male 1116.922119140625 0.8459006547927856 0.02173698738170347 neutral Under proper procedures, knowledge of the pending Presidential visit might have prompted
LJSpeech LJ015-0190 63 male 1128.6842041015625 0.7676864862442017 0.02173698738170347 sad that of dishonest rogues who assume piety and philanthropy as a cloak for the world.
LJSpeech LJ003-0115 62 male 1147.26904296875 0.832510232925415 0.023842993079584776 neutral The judge sat in proper form. He was punctiliously styled, my
LJSpeech LJ045-0244 67 male 1213.853271484375 0.797639787197113 0.023842993079584776 angry Long before the assassination, he expressed his hatred for American society.
LJSpeech LJ004-0051 66 male 850.4898681640625 0.8350055813789368 0.023842993079584776 happy When in Belgium he had examined with great satisfaction the admirable manner
LJSpeech LJ011-0265 62 male 1222.8548583984375 0.8727220892906189 0.023842993079584776 sad This Mr. Canning had left his widow a life interest in two thousand pounds.
LJSpeech LJ028-0087 53 male 983.9290161132812 0.5809707641601562 0.01661304151624549 neutral Such was the appearance of the builder of the walls of Babylon.
LJSpeech LJ028-0178 65 male 945.8756713867188 0.8222951292991638 0.024875902527075812 happy The walls of Babylon were so long and wide and high that all who
LJSpeech LJ011-0113 64 male 1133.138427734375 0.8332394957542419 0.024875902527075812 sad He met his death with unshaken firmness, only in treating that a certain blue
LJSpeech LJ039-0162 61 male 1100.3876953125 0.8697624802589417 0.024875902527075812 sad In an effort to test the rifle under conditions which simulated those which prevailed during
LJSpeech LJ045-0066 53 male 1083.7061767578125 0.3979305326938629 0.014269723360655738 sad Oswald struck his wife on occasion.
LJSpeech LJ028-0277 50 male 1167.56591796875 0.5111944675445557 0.017220543032786886 surprised One of his Sumter mules gave birth to a foal.
LJSpeech LJ048-0197 64 male 992.088134765625 0.8570470213890076 0.028240266393442622 neutral I then told the officers that their primary duty was traffic and crowd control and that
LJSpeech LJ019-0170 66 male 1029.4417724609375 0.8437999486923218 0.028240266393442622 neutral More officers were appointed, as the time of so many of those already on the staff
LJSpeech LJ041-0092 59 male 839.0108642578125 0.7990822792053223 0.030625 neutral and quote, individual that you would brainwash and quite easy, but I think
LJSpeech LJ039-0027 52 male 838.1315307617188 0.2564748227596283 0.00900361111111111 happy and switch with Azure.
LJSpeech LJ030-0077 64 male 1158.3037109375 0.809492826461792 0.030625 neutral Each agent carried a .38 caliber pistol and a shotgun and
LJSpeech LJ038-0112 64 male 1158.54931640625 0.7811505198478699 0.030625 neutral As previously indicated, Marina Oswald testified that she took the
LJSpeech LJ017-0119 62 male 1471.981689453125 0.7784394025802612 0.022372159090909092 happy All the circumstances were so suspicious that he could not escape the criminal charge.
LJSpeech LJ007-0242 63 male 898.47412109375 0.860797107219696 0.022161728896103895 neutral No complete and permanent improvement was indeed possible while Newgate remained unchanged.
LJSpeech LJ020-0046 66 male 936.8101806640625 0.7930992841720581 0.022372159090909092 happy Close the dough over it, dust your hands and kneading board with flour,
LJSpeech LJ018-0001 68 male 1067.754638671875 0.8335044980049133 0.022372159090909092 neutral The Chronicles of Newgate, Volume 2, by Arthur Griffiths. Section 21.
LJSpeech LJ026-0098 57 male 1459.13232421875 0.8028250336647034 0.02745268924302789 neutral The circulatory system distributes these foods in animals,
LJSpeech LJ021-0152 61 male 1135.7069091796875 0.7745066285133362 0.02745268924302789 neutral and to experiment for a reasonable time with measures suitable to
LJSpeech LJ047-0223 62 male 809.3446655273438 0.6974994540214539 0.02534586653386454 neutral I don't recall the exact date, it was about a week prior."
LJSpeech LJ007-0040 61 male 817.2888793945312 0.6683530211448669 0.020692480079681275 happy In a second room were fourteen more who had every hope of a reprieve.
LJSpeech LJ050-0103 63 male 908.8638305664062 0.795621395111084 0.02478642086330935 sad were all men who acted alone in their criminal acts against our leaders.
LJSpeech LJ037-0016 62 male 1227.5238037109375 0.8029683232307434 0.02478642086330935 neutral Scoggins hurriedly left his seat and hid behind the cab, as the man came back to
LJSpeech LJ028-0336 59 male 1117.00927734375 0.6433566808700562 0.018222347122302158 sad Darius now, still keeping to the plan agreed upon.
LJSpeech LJ004-0233 63 male 873.744384765625 0.8720524311065674 0.02478642086330935 neutral Under the new rule, visitors were not allowed to pass into the interior of the prison,
LJSpeech LJ024-0127 66 male 790.1196899414062 0.8227817416191101 0.025333180147058824 angry You who know me can have no fear that I would tolerate the destruction
LJSpeech LJ004-0189 67 male 882.4647216796875 0.8654625415802002 0.025333180147058824 happy The great principles of classification, cleanliness and employment were closely observed.
LJSpeech LJ005-0045 60 male 1204.24072265625 0.5985238552093506 0.017624310661764703 neutral Nor did it confine itself to mere verbal recommendations.
LJSpeech LJ005-0158 63 male 1125.4774169921875 0.658680260181427 0.021271369485294116 happy Bedding and clothing was still denied, but only in a few jails.
LJSpeech LJ011-0018 65 male 1102.3902587890625 0.8398444652557373 0.024434840425531915 neutral The crime, long carried on without detection, was first discovered in 1820.
LJSpeech LJ017-0022 63 male 950.7742919921875 0.7860767841339111 0.024434840425531915 neutral was that of Eliza Fenning, who was convicted of an attempt to poison a whole family.
LJSpeech LJ030-0136 58 male 961.9500122070312 0.7000767588615417 0.024434840425531915 sad The President replied, that is very obvious.
LJSpeech LJ039-0078 59 male 994.359375 0.811722457408905 0.024434840425531915 neutral The effect of a four-power telescopic sight on the difficulty of these shots
LJSpeech LJ048-0254 63 male 1418.6072998046875 0.8071127533912659 0.023045568561872908 neutral advised in the course of the Secret Service investigation of these events that each agent
LJSpeech LJ049-0083 63 male 1159.44287109375 0.8841239213943481 0.023045568561872908 angry It has long been a federal crime to conspire to injure any federal officer on account of
LJSpeech LJ006-0043 61 male 963.5223999023438 0.8555874228477478 0.023045568561872908 neutral The disgraceful overcrowding had been partially ended, but the same evils of
LJSpeech LJ025-0109 62 male 1129.241943359375 0.8006194829940796 0.023045568561872908 neutral has been as completely invalidated as the third and second.
LJSpeech LJ044-0163 62 male 957.9028930664062 0.8367945551872253 0.02478642086330935 neutral Marina Oswald testified that her husband engaged in fair play for Cuba.
LJSpeech LJ003-0037 65 male 1216.96533203125 0.8291783332824707 0.02478642086330935 neutral A site was purchased between Red Lion and White Cross streets and a new
LJSpeech LJ013-0117 61 male 1048.94189453125 0.8147414922714233 0.02478642086330935 happy and with another carry off the plate-chest in broad daylight and as a matter of business.
LJSpeech LJ027-0096 52 male 987.2114868164062 0.5842734575271606 0.01701371402877698 neutral as indeed she must be according to the derivation theory.
LJSpeech LJ021-0035 61 male 1195.374755859375 0.8064736127853394 0.024875902527075812 happy saved debtors and creditors alike in many other fields of enterprise.
LJSpeech LJ049-0159 67 male 1158.23046875 0.8147270679473877 0.024875902527075812 neutral the Secret Service, then the only federal investigative agency, assumed
LJSpeech LJ039-0100 60 male 1073.9599609375 0.7953296303749084 0.024875902527075812 happy During the first week of an intensive eight-week training period, he received and
LJSpeech LJ019-0302 66 male 927.6077270507812 0.7891265153884888 0.024875902527075812 neutral In 1862, there were in all 193 jails.
LJSpeech LJ030-0229 59 male 1128.42724609375 0.8108355402946472 0.0216686320754717 neutral At this point, the cars were speeding through the underpass and had left the scene of the
LJSpeech LJ005-0151 55 male 1023.0142211914062 0.8571876287460327 0.0216686320754717 neutral In others, women were very properly exempted from it, and also from all severe
LJSpeech LJ015-0115 69 male 1087.5670166015625 0.8020297884941101 0.0216686320754717 sad Little more remains to be said about Robson. He appears to have accepted his position.
LJSpeech LJ010-0279 66 male 988.0106201171875 0.8697272539138794 0.0216686320754717 neutral I shall mention briefly one more case, in which, however, there was no murderous
LJSpeech LJ044-0119 62 male 947.2424926757812 0.8160279393196106 0.022372159090909092 neutral into which Oswald, in his own words, had quote, thrown himself. He sought it
LJSpeech LJ049-0115 65 male 1047.099853515625 0.8133270740509033 0.022372159090909092 neutral of the person who is actually in the exercise of the executive power or
LJSpeech LJ004-0139 64 male 965.0425415039062 0.8386383652687073 0.022372159090909092 happy In the morning, the stench and heat were so oppressive that he and everyone else
LJSpeech LJ007-0223 61 male 1323.4381103515625 0.8278164267539978 0.022372159090909092 neutral The prison officials appear to be on the side of the inspectors, to the great dissatisfaction
LJSpeech LJ032-0070 67 male 1133.4202880859375 0.764536440372467 0.023122902684563757 neutral Ordinarily, Inspector Holmes testified, identification is not requested because
LJSpeech LJ015-0294 64 male 1171.9451904296875 0.816940426826477 0.023122902684563757 neutral He forgot to add that it was to be placed to Ralph's credit, and when he called
LJSpeech LJ014-0083 60 male 1259.9434814453125 0.8244383335113525 0.023122902684563757 neutral which, having possessed herself of the murdered man's keys, she rifled from
LJSpeech LJ040-0195 67 male 1063.7657470703125 0.8319800496101379 0.023122902684563757 neutral It appears that he did not want to do any of the things which the authorities suggest
LJSpeech LJ030-0028 57 male 1052.3692626953125 0.8686795830726624 0.024093094405594404 neutral and the Vice President and Mrs. Johnson were in the receiving line to greet President
LJSpeech LJ009-0145 63 male 1096.4228515625 0.8129713535308838 0.024093094405594404 neutral and he seemed to suffer great inward agitation when the ordinary, particularly
LJSpeech LJ031-0117 50 male 923.3846435546875 0.5800350308418274 0.01670563811188811 neutral assisted by doctors William Osborne and John Parker.
LJSpeech LJ004-0115 64 male 937.3268432617188 0.8438901305198669 0.024093094405594404 happy Infirmaries separating the sexes were also to be provided. A chapel too.
LJSpeech LJ039-0200 50 male 1002.3143310546875 0.47214874625205994 0.014967765748031497 neutral And one of these agents, Robert A. Frazier,
LJSpeech LJ028-0481 59 male 761.4593505859375 0.7362769842147827 0.027128444881889764 happy Nabo-Polisar, the father, my begetter, built Imgur-Bel.
LJSpeech LJ003-0245 68 male 855.0972290039062 0.8284653425216675 0.027128444881889764 disgusted preparatory to their appearance in the Old Bailey. Irons were seldom removed.
LJSpeech LJ017-0261 55 male 1154.7520751953125 0.7629727721214294 0.027128444881889764 neutral Seven were found guilty of murder on the high seas and one, Carlos, a criminal.
LJSpeech LJ007-0137 62 male 1012.0001220703125 0.8231872320175171 0.021805775316455698 neutral More attention to ventilation, which was altogether neglected and inadequate, would
LJSpeech LJ041-0017 57 male 845.4912719726562 0.8081146478652954 0.021805775316455698 neutral Wobel also recalled that Oswald once outlined a plan to cut the glass in the
LJSpeech LJ043-0061 65 male 1211.890625 0.7867423295974731 0.021805775316455698 neutral At the time of his defection, when he evidenced no interest in his father and
LJSpeech LJ033-0097 60 male 961.7119750976562 0.8147653937339783 0.021805775316455698 neutral Frazier parked the car in the company parking lot about two blocks north of the depository
LJSpeech LJ019-0234 60 male 982.4412841796875 0.8467894792556763 0.0275625 sad The old buildings were entirely disused, and the whole of the inmates of Newgate were
LJSpeech LJ016-0296 54 male 1260.2855224609375 0.4604700207710266 0.01482325 neutral The actual execution made some impression.
LJSpeech LJ023-0072 69 male 1206.0687255859375 0.7865973711013794 0.0275625 neutral Congress passed a statute which, in 1803, the courts
LJSpeech LJ050-0100 67 male 934.5872192382812 0.8466907143592834 0.0275625 neutral The Commission recognizes that no set of meaningful criteria will yield
LJSpeech LJ028-0113 63 male 1025.2725830078125 0.6758575439453125 0.018996001683501684 neutral With mortar and bricks, he built two moat walls about the city.
LJSpeech LJ013-0121 65 male 1238.6756591796875 0.7939990162849426 0.023200757575757576 sad Howe and his accomplice were arrested. The former was found guilty and sentenced
LJSpeech LJ014-0273 63 male 1042.2625732421875 0.8731350302696228 0.023200757575757576 neutral The detection of these frauds came while he was still prominently before the world as
LJSpeech LJ027-0136 63 male 1052.698974609375 0.8104369044303894 0.023200757575757576 neutral Illustrations quoted from the works of Romains and Locante will make this principle
LJSpeech LJ003-0067 52 male 1101.8662109375 0.4360215961933136 0.011485659246575342 neutral in the then existing state of the law,
LJSpeech LJ039-0220 65 male 1079.593994140625 0.8265351057052612 0.023598030821917807 neutral and that one would not have to be an expert marksman to have accomplished the assassination.
LJSpeech LJ018-0029 63 male 892.7648315429688 0.8415856957435608 0.023598030821917807 neutral A photograph of Mueller shown the jeweler was identified as the likeness of the jewel.
LJSpeech LJ001-0080 70 male 945.280517578125 0.8912354707717896 0.023598030821917807 neutral He seems to have taken the letter of the Elseviers of the 17th century for his mom.
LJSpeech LJ028-0454 59 male 1161.063720703125 0.4748651683330536 0.01538449074074074 neutral But both sections originally reached the river.
LJSpeech LJ037-0244 61 male 1057.122802734375 0.8424121737480164 0.025520833333333333 neutral Westbrook identified Commission Exhibit 162 as the light-colored
LJSpeech LJ008-0174 64 male 970.2979125976562 0.8039829730987549 0.025520833333333333 sad One cartload of spectators having broken down, some of its occupants fell
LJSpeech LJ009-0302 62 male 1106.8897705078125 0.7679134011268616 0.025520833333333333 sad Another man was hired, himself a convict, whose fees for self and wife were
LJSpeech LJ002-0182 60 male 1128.78076171875 0.7411836981773376 0.021869359205776172 neutral the fleet and the Marshall Sea prisons especially devoted to them.
LJSpeech LJ048-0023 61 male 983.9628295898438 0.8298522233963013 0.024875902527075812 neutral and he had told us during one of the interviews that he would probably take his wife back to
LJSpeech LJ019-0257 65 male 1225.7010498046875 0.7593720555305481 0.024875902527075812 sad Here, the tread wheel was in use. There, cellular cranks.
LJSpeech LJ040-0046 67 male 991.2825927734375 0.7420750856399536 0.024875902527075812 neutral which one associate described as, quote, irrevocable, end quote.
LJSpeech LJ006-0295 60 male 1199.44775390625 0.8367014527320862 0.02745268924302789 neutral The governor was also personally responsible for gross contravention of this rule of
LJSpeech LJ028-0252 51 male 1160.4306640625 0.45107951760292053 0.014317978087649402 neutral Only the king's son, Belshazzar, was killed.
LJSpeech LJ035-0002 64 male 921.927734375 0.7000311613082886 0.02745268924302789 neutral Chapter 4 The Assassin Part 4 Oswald's Actions
LJSpeech LJ009-0189 65 male 1282.5374755859375 0.8760916590690613 0.02745268924302789 neutral Persons were still living in 1855 who had witnessed dissections at Hicks Hall.
LJSpeech LJ011-0032 61 male 919.7144165039062 0.8366391062736511 0.02452179715302491 neutral declared that they had hitherto formed a high opinion of his honor, integrity, and
LJSpeech LJ042-0155 61 male 1077.17431640625 0.6541056036949158 0.018654137010676156 neutral It appears to be the work of a fairly well-organized person.
LJSpeech LJ004-0039 55 male 796.300537109375 0.7321549654006958 0.02127335409252669 sad They were hopeless of any general reform by the action of the executive alone.
LJSpeech LJ025-0157 61 male 1216.162353515625 0.8438274264335632 0.02452179715302491 neutral under these circumstances, unnatural as they are, with proper management
LJSpeech LJ030-0121 56 male 970.6068725585938 0.7965176701545715 0.02469758064516129 happy Several times, Special Agent John D. Reddy came forward from the right front
LJSpeech LJ010-0135 60 male 1289.2467041015625 0.5666248798370361 0.019533378136200718 happy yelled out three cheers to the populace whom he faced.
LJSpeech LJ050-0118 67 male 922.9232788085938 0.8592643141746521 0.02469758064516129 neutral Since these agencies are already obliged constantly to evaluate the activities
LJSpeech LJ040-0175 60 male 1017.7406005859375 0.7815579175949097 0.02331832437275986 neutral through which they could not reach him, but that he preferred the veil to remain intact.
LJSpeech LJ013-0042 62 male 1047.09814453125 0.8341838717460632 0.023279138513513514 neutral the foundations of which had been laid by buying old ships on purpose to cast the more
LJSpeech LJ014-0076 61 male 1151.91357421875 0.835938036441803 0.023279138513513514 happy He was seen afterwards smoking and talking with his hosts in their back parlor.
LJSpeech LJ008-0097 64 male 1172.0582275390625 0.7829859852790833 0.023279138513513514 sad No sooner was the job finished than half a dozen competitors appeared.
LJSpeech LJ043-0166 64 male 824.7908325195312 0.7988844513893127 0.023279138513513514 neutral Possibly he might have wanted to be caught and wanted his involvement made clear
LJSpeech LJ045-0127 64 male 1247.29638671875 0.8624638915061951 0.023122902684563757 neutral He absolved the Soviet embassy in Mexico City of any blame for his difficulties.
LJSpeech LJ027-0117 65 male 964.4253540039062 0.79942786693573 0.023122902684563757 happy Now, here again the former theory appears to be triumphant over the latter.
LJSpeech LJ035-0076 63 male 1200.859130859375 0.8349494338035583 0.023122902684563757 happy Special Agent John Howlett of the Secret Service carried a rifle from the south
LJSpeech LJ005-0101 58 male 1129.3095703125 0.7242733836174011 0.0196302432885906 sad Quince it deduced the practice and condition of every prison that replied.
LJSpeech LJ028-0302 64 male 1078.13818359375 0.7867964506149292 0.027235671936758892 happy I will desert to the enemy as I am, and when I get into their city,
LJSpeech LJ036-0196 65 male 1095.0950927734375 0.803264319896698 0.027235671936758892 neutral Tippett patrolled District 78 in the Oak Cliff area of Dixie.
LJSpeech LJ028-0191 60 male 1061.2279052734375 0.42850354313850403 0.013888586956521738 happy The old enemies of Babylon rejoiced.
LJSpeech LJ038-0214 57 male 1284.11328125 0.8420044779777527 0.027235671936758892 neutral and the other paragraphs instructed her on the disposal of Oswald's personal effects
LJSpeech LJ003-0304 65 male 832.8008422851562 0.8691624999046326 0.023760775862068966 neutral with the penalty of forfeiting the day's allowance of food, an increase of which the committee
LJSpeech LJ019-0009 56 male 1075.1240234375 0.7964742183685303 0.02298556034482759 neutral or to insist upon the construction of prisons on the most approved plan.
LJSpeech LJ022-0192 57 male 1077.5343017578125 0.5187281370162964 0.014764870689655173 neutral They contemplate the enrichment of our national life.
LJSpeech LJ007-0026 58 male 1158.8209228515625 0.793979287147522 0.023760775862068966 sad And with all this, the most dreadful oaths, the worst language, too bad to be repeated.
LJSpeech LJ035-0158 66 male 997.3316040039062 0.8228869438171387 0.023679123711340205 neutral A blue jacket, later identified by Marina Oswald as her husband's, was
LJSpeech LJ033-0014 54 male 959.2794799804688 0.8163527250289917 0.023679123711340205 sad Lee Harvey Oswald lived in a rooming house in Dallas, while his wife and children lived
LJSpeech LJ018-0369 66 male 1060.0634765625 0.7843106389045715 0.023679123711340205 neutral the mysterious Bravo case, that of Dr. Lamson, and that of
LJSpeech LJ017-0028 61 male 1008.4209594726562 0.8143091797828674 0.023679123711340205 neutral that she had had a quarrel with her mistress, and that the latter, with all others,
LJSpeech LJ004-0109 57 male 1024.0537109375 0.4420858919620514 0.013042491007194245 neutral according to their categories or crimes.
LJSpeech LJ011-0239 63 male 950.5305786132812 0.8259799480438232 0.02478642086330935 neutral Where robbery with violence was intended, the perpetrators had now to adopt various
LJSpeech LJ049-0034 62 male 1265.04345703125 0.8034409284591675 0.02478642086330935 surprised could have reached the President in time to protect him from the second and fatal shot
LJSpeech LJ016-0389 65 male 846.389892578125 0.8337498307228088 0.024438174460431655 neutral Giovanni Lanni, the Italian boy who murdered a French woman in the hay market.
LJSpeech LJ037-0018 58 male 978.4235229492188 0.7971556186676025 0.029197563559322032 sad Skaggins saw him and heard him mutter either, Poor damn cop or...
LJSpeech LJ041-0163 65 male 961.3748779296875 0.867605984210968 0.029197563559322032 neutral Out of a combination of Oswald's known Marxist sympathies and George Orwell's
LJSpeech LJ017-0178 61 male 1041.3836669921875 0.7767281532287598 0.029197563559322032 surprised It appeared that several persons with whom she was intimate had succumbed suddenly.
LJSpeech LJ048-0006 54 male 986.5155639648438 0.377620667219162 0.014482256355932204 neutral or to the Vice President."
LJSpeech LJ002-0184 64 male 914.2891845703125 0.8724705576896667 0.025333180147058824 neutral the latter two being also a prison for felons and vagrants arrested within certain
LJSpeech LJ012-0143 65 male 1187.48974609375 0.8944374322891235 0.025333180147058824 neutral Money Moses had received the stolen gold dust from Moss's father-in-law, Davis.
LJSpeech LJ023-0012 59 male 1082.41943359375 0.7502818703651428 0.025333180147058824 neutral But when almost two years later it came before the Supreme
LJSpeech LJ036-0068 63 male 865.4218139648438 0.6849870085716248 0.02191842830882353 sad Both buses stopped within one block of the depository building.
LJSpeech LJ046-0158 66 male 1267.1650390625 0.8534243106842041 0.02222782258064516 neutral At the time of the assassination, the active PRS general files contained
LJSpeech LJ011-0287 60 male 1136.369384765625 0.8295741677284241 0.02222782258064516 neutral but at length managed to wriggle out of the chain which confined his body and
LJSpeech LJ016-0399 65 male 1150.46533203125 0.8492998480796814 0.02222782258064516 neutral Wainwright was allowed a cigar the night before execution, which he smoked in the prison yard.
LJSpeech LJ006-0012 61 male 944.3914794921875 0.8449038863182068 0.02222782258064516 neutral but he would not arm them with any authority lest their cooperation might be offensive.
LJSpeech LJ005-0102 62 male 1208.780029296875 0.8509426712989807 0.02194466560509554 neutral Upon these and the private visitations made by various members, the Society
LJSpeech LJ001-0169 66 male 1138.5301513671875 0.825796365737915 0.02194466560509554 neutral The paper used for printing the small, highly ornamented French service books.
LJSpeech LJ011-0271 66 male 1177.6123046875 0.8682052493095398 0.02194466560509554 neutral Mr. G went in the coach sent for him and alighted at 27 York Street,
LJSpeech LJ014-0057 65 male 1001.4357299804688 0.8337165117263794 0.02194466560509554 neutral London did not escape the contagion and, prominent among the detestable crimes of
LJSpeech LJ008-0191 65 male 960.4435424804688 0.7966058254241943 0.025904605263157895 disgusted At Courvoisier's execution in 1840, it was the same, or worse.
LJSpeech LJ007-0120 60 male 1031.863525390625 0.5902575850486755 0.018803806390977444 sad some to the infirmary, many more to the governor's house.
LJSpeech LJ012-0268 56 male 1237.5946044921875 0.7429749965667725 0.023254934210526317 neutral Suspicion grew almost to certainty as the evidence was unfolded.
LJSpeech LJ015-0298 70 male 1055.9366455078125 0.7635851502418518 0.025904605263157895 happy But while Hardwick was in communication with Sawward, the bank was in communication with
LJSpeech LJ030-0213 62 male 1213.4908447265625 0.7537742257118225 0.02620009505703422 neutral Hill heard a second shot, approximately five seconds after the first, which removed
LJSpeech LJ009-0121 61 male 1158.9774169921875 0.7147454023361206 0.02333769011406844 happy for all thy goodness and loving kindness to us and to all men.
LJSpeech LJ037-0084 63 male 1030.7886962890625 0.796912431716919 0.02620009505703422 neutral Barbara Jeanette Davis testified that no one had shown her a picture of Oswald before.
LJSpeech LJ009-0219 56 male 1173.5 0.4054138660430908 0.01342134030418251 happy A clause was inserted to the effect that
LJSpeech LJ040-0143 64 male 1050.6236572265625 0.8078874945640564 0.023598030821917807 neutral Contrary to reports that appeared after the assassination, the psychiatric examiner
LJSpeech LJ006-0048 62 male 1121.3856201171875 0.8324679732322693 0.023598030821917807 neutral To these were still added an average of about 50, expecting the last penalty.
LJSpeech LJ033-0006 58 male 1046.85986328125 0.5671167969703674 0.01674593321917808 neutral In this connection, the Commission considered one
LJSpeech LJ044-0207 64 male 804.4202270507812 0.8754435181617737 0.023598030821917807 neutral the economic embargo against that country, and the general policy of the United States.
LJSpeech LJ032-0260 62 male 1013.1453247070312 0.8036773204803467 0.023598030821917807 neutral He thought it contained tent poles or possibly other camping equipment, such as
LJSpeech LJ017-0030 63 male 1237.06982421875 0.842721164226532 0.023598030821917807 happy When the spread of scientific knowledge places nefarious means at the disposal of
LJSpeech LJ005-0084 62 male 1217.72900390625 0.7495614290237427 0.023598030821917807 neutral so as to prevent them from seeing, conversing, or holding any interaction.
LJSpeech LJ008-0236 52 male 1014.2316284179688 0.7807179093360901 0.02271853595890411 neutral Coiled up on the floor of the scaffold like a serpent, the hangman's rope.
LJSpeech LJ027-0020 60 male 1027.0858154296875 0.7961769104003906 0.026502403846153846 happy The unity in life, then, is not less a fact than is life's great diversity.
LJSpeech LJ017-0238 66 male 1097.978515625 0.7882980704307556 0.026502403846153846 fearful Tefer the second mate agreed, but constantly went in fear of his life.
LJSpeech LJ045-0233 66 male 1085.60400390625 0.800809919834137 0.026502403846153846 angry He consistently refused to admit involvement in the assassination
LJSpeech LJ038-0099 48 male 939.3925170898438 0.5590772032737732 0.019237740384615384 neutral From the outset, Oswald denied owning a rifle.
LJSpeech LJ028-0231 59 male 916.1470947265625 0.5505949854850769 0.01667814781021898 neutral whereupon they withdrew within their defenses.
LJSpeech LJ044-0239 66 male 1262.919677734375 0.8477627635002136 0.025148266423357664 neutral and raises serious questions as to whether or not he ever expected to
LJSpeech LJ050-0086 65 male 1170.1837158203125 0.8474505543708801 0.025148266423357664 neutral Under these criteria, whether the case should be referred to the Secret Service depends
LJSpeech LJ047-0021 66 male 1138.4693603515625 0.8763906955718994 0.025148266423357664 neutral information regarding his relations with the U.S. Embassy in Moscow and background
================================================
FILE: README.md
================================================
# SpeechCraft
This is the official repository of the ACM Multimedia 2024 paper *"SpeechCraft: A Fine-Grained Expressive Speech Dataset with Natural Language Description"*.
For details of the pipeline and dataset, please refer to our [Paper](http://arxiv.org/abs/2408.13608) and [Demo Page](https://speechcraft2024.github.io/speechcraft2024/).
<!-- Dataset and pipeline are coming soon. -->
## News
[2024-09-26]: **Structured metadata** (pitch, energy, speed, age, gender, emotion tone, emphasis, topic/category, and transcript) has been made available to facilitate further enhancements and augmentations of the dataset.
[2024-12-20]: Code and checkpoint of the **Annotation pipeline** are released.
## SpeechCraft Dataset
### 1. Download Speech Corpus
|Language|Speech Corpus|#Duration|#Clips|
|:--------:|:--------:|--------:|--------:|
|ZH|[Zhvoice](https://github.com/fighting41love/zhvoice)|799.68h|1,020,427|
|ZH|[AISHELL-3](https://www.openslr.org/93/)|63.70h|63,011|
|EN|[GigaSpeech-M](https://huggingface.co/datasets/speechcolab/gigaspeech/tree/main/data/audio/m_files_additional)|739.91h|670,070|
|EN|[LibriTTS-R](https://www.openslr.org/141/)|548.88h|352,265|
<!-- ## Metadata Walkthrough -->
### 2. Download Speech Annotation
||Description|Instruction|Labels|
|:--------:|:--------:|:--------:|:--------:|
|ZH|[download](https://cloud.tsinghua.edu.cn/f/e66664542f534f399802/?dl=1)|[download](https://cloud.tsinghua.edu.cn/f/d6f00e027f504751b4c0/?dl=1)|[download](https://cloud.tsinghua.edu.cn/f/02a69d7c862e4422850e/?dl=1)|
|EN|[download](https://cloud.tsinghua.edu.cn/f/517428835bd5486e87e8/?dl=1)|[download](https://cloud.tsinghua.edu.cn/f/cce83dd884ed4104b1a1/?dl=1)|[download](https://cloud.tsinghua.edu.cn/f/6f05dcbcfb384ea1870b/?dl=1)|
### 3. Labels and Prompts
#### EN Version
- `--gender`: Male, Female
- `--age`: Child, Teenager, Youth adult, Middle-aged, Elderly
- `--pitch`: low, normal, high
- `--speed`: slow, normal, fast
- `--volume`: low, normal, high
- `--emotion (English)`: Fearful, Happy, Disgusted, Sad, Surprised, Angry, Neutral
- `--emphasis`: Non-label words
- `--transcript`: Non-label sentence
- `--LLM Prompt`:
```Given the pitch, volume, age, gender, tone, and transcript, use sentiment analysis techniques to describe in natural language what age, what gender of a person, with what kind of emotion and tone, using what kind of pitch and volume, spoke the words in the transcript.
Note: You must vividly describe the sentence’s intonation, pitch, tone, and emotion. All outputs must strictly avoid identical wording and sentence structure. There is no need to describe body language or psychological state and do not repeat the input content.
Refer to the format of the following four cases:
*Example Input - Example Output*
Now try to process the following sentences, directly output the converted sentences according to the examples without missing any labels.
```
#### ZH Version
- `--年龄`:儿童,少年,青年,中年,老年
- `--性别`:男,女
- `--语速`:快,中,慢
- `--音高`:高,中,低
- `--音量`:高,中,低
- `--重读`:无标签,字词
- `--语气`:无标签,自然语句
- `--文本`:无标签,自然语句
- `--LLM Prompt`:
```
请参照以下转换案例,使用中文自然语言描述一个人按照给定风格属性,如音高、音量、年龄、性别、语调,来说文本中的话。注意,仅描述说话风格,不需要描述肢体动作或心理状态,不要重复输入的内容。
*示例输入-示例输出*
现在尝试处理以下句子,根据示例直接输出转换后的句子,不要遗漏任何标签。
```
### 4. Request Access to Emphasis Speech Dataset
We do not own the copyright of the original audio files. Researchers and educators who wish to use the audio for non-commercial research and/or educational purposes can be granted access to our regenerated version under certain terms and conditions. To apply for the AISHELL-3 and LibriTTS-R data with fine-grained keyword emphasis, please fill out the EULA form at `Emphasis-SpeechCraft-EULA.pdf` and send the scanned form to jinzeyu23@mails.tsinghua.edu.cn. Once approved, you will be supplied with a download link. **([2024-09-26]: With metadata updated!)**
Please first refer to some emphasis examples provided [here](https://speechcraft2024.github.io/speechcraft2024/#13-examples-of-the-regenerated-emphasis-data-from-aishell-3-and-libritts-r). We are actively working on improving methods for large-scale fine-grained data construction that align with human perception.
|Language|Speech Corpus|#Duration|#Clips|
|:--------:|:--------:|--------:|--------:|
|ZH|AISHELL-3-stress|50.59h|63,258|
|EN|LibriTTS-R-stress|148.78h|75,654|
## Annotation Pipeline
### Step 0 : Installation
1. Download models for speech style recognition.
Models from 🤗:
```
llama_base_model = "baichuan-inc/Baichuan2-13B-Base"
gender_model_path = "alefiury/wav2vec2-large-xlsr-53-gender-recognition-librispeech"
age_model_path = "audeering/wav2vec2-large-robust-24-ft-age-gender"
asr_path = "openai/whisper-medium" / "openai/whisper-large-v3"
```
Model from funasr (for English emotion classification):
```
emotion_model = "iic/emotion2vec_base_finetuned"
```
Prepare SECap from [here](https://github.com/thuhcsi/SECap) (for Chinese emotion captioning).
2. Create conda environment
```
conda env create -f ./requirements.yaml
mv ./AutomaticPipeline/models/SECap/model2.py $your_SECap_dir
```
3. Download the LoRA checkpoint from [here](https://cloud.tsinghua.edu.cn/d/548948399d7c4816b677/) and save it as `./llama-ft/finetuned-llama/`; it is used for description rewriting.
Remember to set `base_model_name_or_path` in `./llama-ft/finetuned-llama/adapter_config.json` to the path of your local base LLM checkpoint.
### Step 1 : Labeling with the Automatic Annotation Pipeline
1. Get the scp file with raw scores for the audio corpus.
```
cd ./AutomaticPipeline
python AutoPipeline.py
```
2. Get the JSON file with the classified results, prepared for description rewriting.
```
python Clustering.py
```
### Step 2 : Rewriting with the Finetuned Llama
```
cd ../llama-ft
python llama_infer.py
```
## Citation
Please cite our paper if you find this work useful:
```
@inproceedings{jin2024speechcraft,
title={Speechcraft: A fine-grained expressive speech dataset with natural language description},
author={Jin, Zeyu and Jia, Jia and Wang, Qixin and Li, Kehan and Zhou, Shuoyi and Zhou, Songtao and Qin, Xiaoyu and Wu, Zhiyong},
booktitle={Proceedings of the 32nd ACM International Conference on Multimedia},
pages={1255--1264},
year={2024}
}
```
================================================
FILE: llama-ft/llama_infer.py
================================================
import os
import torch
import argparse
from accelerate import Accelerator
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import AutoPeftModelForCausalLM
import json
from tqdm import tqdm
import random
import torch.nn as nn
import torch.distributed as dist
from torch.utils.data import DataLoader, Subset
# Select the 'spawn' start method for worker processes; main() below starts one
# process per CUDA device. force=True applies the setting even if a start
# method was already chosen.
torch.multiprocessing.set_start_method('spawn', force=True)
# Prompt templates keyed by name. Dataset.__getitem__ looks up
# PROMPT_DICT[<language>] and fills it with str.format_map(), so the "chinese"
# and "english" templates need a `labels` field in each record.
# The strings are runtime text fed to the model: do not reformat them.
PROMPT_DICT = {
# Generic template with `{instruction}` and `{input}` placeholders
# (not referenced elsewhere in this file).
"prompt_input": (
"Below is an instruction that describes a task, paired with an input that provides further context. "
"Write a response that appropriately completes the request.\n\n"
"### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
),
# Generic template with only an `{instruction}` placeholder
# (not referenced elsewhere in this file).
"prompt_no_input": (
"Below is an instruction that describes a task. "
"Write a response that appropriately completes the request.\n\n"
"### Instruction:\n{instruction}\n\n### Response:"
),
# Template selected by `--language chinese`; the instruction text is Chinese
# and the `{labels}` placeholder receives the tab-joined tags.
# NOTE(review): "###Input:" has no space after the hashes, unlike
# "### Instruction:" — presumably this matches the fine-tuning format;
# confirm before changing.
"chinese": (
"Below is an instruction that describes a task. "
"Write a response that appropriately completes the request.\n\n"
"### Instruction:\n给定音高、音量、年龄、性别、语气等信息以及文本,运用情感分析的技巧,用中文自然语言描述。"
"注意必须生动且多样化地描述,不需要描述肢体动作或心理状态,不要重复input内容。\n\n"
"###Input:\n{labels}\n\n### Response:"
),
# Template selected by `--language english` (the argparse default); same
# `{labels}` placeholder as the Chinese template.
# NOTE(review): the two instruction sentences are concatenated without a
# separating space ("...analysis.Make sure...") — presumably identical to the
# fine-tuning prompt; confirm before changing.
"english": (
"Below is an instruction that describes a task. "
"Write a response that appropriately completes the request.\n\n"
"### Instruction:\nGiven information such as pitch, volume, age, gender, tone, category and text, describe using English natural language with the techniques of emotional analysis."
"Make sure the description is vivid and diverse, without the need to describe physical actions or psychological states, and avoid repeating the input content.\n\n"
"###Input:\n{labels}\n\n### Response:"
)
}
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that turns label records from a JSON file into prompts.

    The file at ``args.json_path`` is loaded as a mapping from a sample key to
    a record; each record must provide a ``'labels'`` string of tab-separated
    tags. Records are addressed by their position in the key order of the
    loaded mapping.

    Args:
        args: Namespace providing ``language`` (a key of ``PROMPT_DICT``),
            ``ckpt_path`` (tokenizer location) and ``json_path``.
    """

    def __init__(self, args):
        self.language = args.language
        self.ckpt_path = args.ckpt_path
        # Read explicitly as UTF-8 (labels may contain Chinese text) and close
        # the handle as soon as the content is parsed.
        with open(args.json_path, encoding='utf-8') as file:
            self.testdata = json.load(file)
        self.keyindex = list(self.testdata.keys())
        self.tokenizer = AutoTokenizer.from_pretrained(args.ckpt_path, use_fast=False, trust_remote_code=True)

    def __len__(self):
        """Return the number of records loaded from the JSON file."""
        return len(self.testdata)

    def __getitem__(self, index):
        """Build the tokenized prompt for the record at position *index*.

        The record's tags are shuffled on every access, so repeated reads of
        the same index may yield differently ordered labels.

        Returns:
            A tuple ``(inputs, key, prompt, labels)``: the tokenizer output
            for the prompt, the record key, the prompt string, and the
            shuffled tab-joined labels used inside the prompt.
        """
        key = self.keyindex[index]
        record = self.testdata[key]
        tags = record['labels'].strip().split('\t')
        # shuffle tags
        random.shuffle(tags)
        shuffled_labels = '\t'.join(tags)
        # Format from a shallow copy so the stored record is never overwritten.
        fields = dict(record)
        fields['labels'] = shuffled_labels
        prompt = '<reserved_106>' + PROMPT_DICT[self.language].format_map(fields) + '<reserved_107>'
        inputs = self.tokenizer(prompt, return_tensors='pt')
        return inputs, key, prompt, shuffled_labels
def extract(input_text, language):
    """Return the quoted span found in *input_text*, or None if there is none.

    Args:
        input_text: Generated text to search.
        language: ``'chinese'`` searches for full-width quotes (“ ... ”); any
            other value searches for ASCII double quotes.

    Returns:
        For ``'chinese'``: the text from the first “ (inclusive) up to the
        next ” (exclusive), or None when either quote is missing.
        Otherwise: the text strictly between the first and the last ``"``,
        or None when fewer than two ``"`` characters are present.
    """
    if language == 'chinese':
        start_index = input_text.find("“")
        if start_index == -1:
            return None
        end_index = input_text.find("”", start_index + 1)
        if end_index == -1:
            # An unclosed quote must not be accepted: using find()'s -1 as the
            # slice end would silently drop the last character instead.
            return None
        # The opening quote is kept in the result on purpose: the caller's
        # length comparison was tuned against this slice.
        return input_text[start_index:end_index]

    if input_text.count("\"") < 2:
        return None
    start_index = input_text.find("\"")
    end_index = input_text.rfind("\"")
    return input_text[start_index + 1:end_index]
def _is_valid_generation(generation, language):
    """Return True if *generation* has a quoted span plus at least 3 other characters."""
    quoted = extract(generation, language)
    return quoted is not None and len(generation) - len(quoted) >= 3


def _append_error_key(error_path, key):
    """Append *key* as one line to the error log at *error_path*."""
    with open(error_path, 'a', encoding='utf-8') as file:
        file.write(key + '\n')


def _dump_results(results, output_path):
    """Write *results* as indented, non-ASCII-escaped JSON to *output_path*."""
    with open(output_path, 'w', encoding='utf-8') as file:
        json.dump(results, file, indent=4, ensure_ascii=False)


def inference_on_device(args, tokenizer, device, dataloader):
    """Generate a description for every sample of *dataloader* on one device.

    Loads the PEFT checkpoint at ``args.ckpt_path``, moves it to *device* and
    samples a description per prompt. A generation is accepted when
    ``extract`` finds a quoted span and at least 3 characters remain besides
    it; otherwise sampling is retried, up to 6 attempts in total. Keys that
    never produce an accepted generation, or that raise, are appended to
    ``args.error_path``.

    Results are written to a per-device file derived from
    ``args.output_path`` (``<root>.<device><ext>``, e.g.
    ``inference.cuda:0.json``), with intermediate checkpoints after 50
    results and after every 1000 results.

    Args:
        args: Namespace with ``language``, ``output_path``, ``error_path``
            and ``ckpt_path``.
        tokenizer: Tokenizer used to decode the sampled token ids.
        device: ``torch.device`` the model and inputs are moved to.
        dataloader: Yields ``(inputs, keys, prompts, labels)`` batches of
            size 1, as produced from ``Dataset``.
    """
    language = args.language
    error_path = args.error_path

    model = AutoPeftModelForCausalLM.from_pretrained(
        args.ckpt_path,
        revision="v2.0",
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
        attn_implementation="flash_attention_2",
    )
    model.to(device)

    top_p = 0.3
    temperature = 0.7
    sample_kwargs = dict(
        max_new_tokens=128,
        repetition_penalty=1.1,
        do_sample=True,
        use_cache=True,
        temperature=temperature,
        top_p=top_p,
    )
    max_attempts = 6  # one initial sample plus five retries

    # Insert the device name before the extension instead of assuming a fixed
    # suffix length; for "*.json" paths this yields the same file name as the
    # former output_path[:-4] slicing.
    root, ext = os.path.splitext(args.output_path)
    output_path = f"{root}.{device}{ext or '.json'}"

    count = 0
    test_result = {}
    with torch.no_grad():
        for inputs, keys, prompts, labels in tqdm(dataloader):
            # unwrap the input: drop the batch dimension added by the DataLoader
            model_inputs = {name: tensor[0].to(device, dtype=torch.long) for name, tensor in inputs.items()}
            key = keys[0]
            prompt = prompts[0]
            label_text = labels[0]

            generation = None
            try:
                for _ in range(max_attempts):
                    pred = model.sample(**model_inputs, **sample_kwargs)
                    # NOTE(review): assumes the decoded text starts with the
                    # prompt string, so slicing by len(prompt) leaves only the
                    # response — confirm how special tokens are rendered.
                    candidate = tokenizer.decode(pred.cpu()[0], skip_special_tokens=True)[len(prompt):]
                    if _is_valid_generation(candidate, language):
                        # Accept the first valid sample, including one
                        # produced on the final attempt.
                        generation = candidate
                        break
            except Exception as e:
                # Best-effort: record the key, report the cause, keep going.
                tqdm.write(f"[{device}] generation failed for {key}: {e!r}")
                _append_error_key(error_path, key)
                continue

            if generation is None:
                _append_error_key(error_path, key)
                continue

            test_result[key] = {'labels': label_text, 'llama-instruction': generation}
            count += 1
            # Checkpoint only when a new result was added, so failed samples
            # do not trigger redundant rewrites of the whole file.
            if count % 1000 == 0 or count == 50:
                _dump_results(test_result, output_path)

    _dump_results(test_result, output_path)
def _shard_ranges(total, num_shards):
    """Split `range(total)` into `num_shards` contiguous ranges covering every index.

    The first `total % num_shards` ranges receive one extra element, so no
    trailing samples are dropped when the sizes do not divide evenly.
    """
    base, extra = divmod(total, num_shards)
    ranges = []
    start = 0
    for shard in range(num_shards):
        stop = start + base + (1 if shard < extra else 0)
        ranges.append(range(start, stop))
        start = stop
    return ranges


def main(args):
    """Shard the inference set across the requested GPUs and run one process per GPU.

    Args:
        args: Namespace providing `devices` (comma-separated CUDA indices) and
            `ckpt_path`, plus whatever `Dataset` and `inference_on_device`
            read from it.

    Returns:
        None. Blocks until every worker process has finished.
    """
    devices = [int(index) for index in args.devices.split(',')]
    tokenizer = AutoTokenizer.from_pretrained(
        args.ckpt_path, revision="v2.0", use_fast=False, trust_remote_code=True
    )
    inferset = Dataset(args)

    # Floor division with equal-sized slices would silently skip the last
    # `len(inferset) % len(devices)` samples; the balanced split covers all.
    data_loaders = [
        DataLoader(Subset(inferset, indices), batch_size=1)
        for indices in _shard_ranges(len(inferset), len(devices))
    ]

    processes = []
    for device_num, loader in zip(devices, data_loaders):
        device = torch.device(f'cuda:{device_num}')
        worker = torch.multiprocessing.Process(
            target=inference_on_device,
            args=(args, tokenizer, device, loader),
        )
        worker.start()
        processes.append(worker)

    for worker in processes:
        worker.join()
if __name__ == "__main__":
    # Command-line options with their defaults, in declaration order; every
    # option is parsed as a string.
    option_defaults = {
        'language': 'english',
        'devices': '0',
        'json_path': './example_libritts.json',
        'output_path': './inference_libritts.json',
        'error_path': './error.txt',
        'ckpt_path': './finetuned-llama',
    }
    parser = argparse.ArgumentParser()
    for option, fallback in option_defaults.items():
        parser.add_argument('--' + option, type=str, default=fallback)
    args = parser.parse_args()
    main(args)
================================================
FILE: requirements.yaml
================================================
name: speechcraft
channels:
- http://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/pytorch/
- http://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main/
- http://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/
- simpleitk
- pytorch
- menpo
- msys2
- nvidia
- defaults
- bioconda
- conda-forge
dependencies:
- _libgcc_mutex=0.1=conda_forge
- _openmp_mutex=4.5=2_gnu
- bzip2=1.0.8=h5eee18b_6
- ca-certificates=2024.11.26=h06a4308_0
- ld_impl_linux-64=2.40=h12ee557_0
- libffi=3.4.4=h6a678d5_1
- libgcc=14.2.0=h77fa898_1
- libgcc-ng=14.2.0=h69a702a_1
- libgomp=14.2.0=h77fa898_1
- libnsl=2.0.1=hd590300_0
- libsqlite=3.47.0=hadc24fc_1
- libstdcxx-ng=11.2.0=h1234567_1
- libuuid=2.38.1=h0b41bf4_0
- libxcrypt=4.4.36=hd590300_1
- libzlib=1.3.1=hb9d3cd8_2
- ncurses=6.4=h6a678d5_0
- openssl=3.4.0=hb9d3cd8_0
- python=3.9.18=h0755675_1_cpython
- readline=8.2=h5eee18b_0
- setuptools=75.1.0=py39h06a4308_0
- tk=8.6.13=noxft_h4845f30_101
- wheel=0.44.0=py39h06a4308_0
- xz=5.4.6=h5eee18b_1
- pip:
- absl-py==2.1.0
- accelerate==0.28.0
- aiohappyeyeballs==2.4.4
- aiohttp==3.11.10
- aiosignal==1.3.1
- aliyun-python-sdk-core==2.16.0
- aliyun-python-sdk-kms==2.16.5
- annotated-types==0.7.0
- antlr4-python3-runtime==4.9.3
- anyio==4.7.0
- argparse==1.4.0
- asgiref==3.8.1
- async-timeout==4.0.3
- asyncio==3.4.3
- attrs==24.2.0
- audioread==3.0.1
- backoff==2.2.1
- bcrypt==4.2.1
- beartype==0.17.2
- beautifulsoup4==4.12.3
- bitarray==3.0.0
- bitsandbytes==0.43.0
- black==24.10.0
- build==1.2.2.post1
- cachetools==5.5.0
- certifi==2024.8.30
- cffi==1.17.1
- charset-normalizer==3.4.0
- chroma-hnswlib==0.7.6
- chromadb==0.5.23
- click==8.1.7
- cohere==5.1.8
- colorama==0.4.6
- coloredlogs==15.0.1
- colt5-attention==0.10.20
- contourpy==1.3.0
- crcmod==1.7
- cryptography==44.0.0
- cycler==0.12.1
- cython==3.0.11
- dalle3==0.1.0
- dataclasses-json==0.6.7
- datasets==2.19.2
- decorator==5.1.1
- deprecated==1.2.15
- diffusers==0.31.0
- dill==0.3.8
- distance==0.1.3
- docstring-parser==0.16
- duckduckgo-search==6.4.1
- durationpy==0.9
- editdistance==0.8.1
- einops==0.7.0
- einops-exts==0.0.4
- einx==0.3.0
- encodec==0.1.1
- exceptiongroup==1.2.2
- fairscale==0.4.13
- faiss-cpu==1.9.0.post1
- fastapi==0.115.6
- fastavro==1.9.7
- filelock==3.13.1
- flake8==7.1.1
- flash-attn==2.3.6
- flatbuffers==24.3.25
- fonttools==4.55.2
- frozendict==2.4.6
- frozenlist==1.5.0
- fsspec==2024.2.0
- ftfy==6.3.1
- funasr==1.1.16
- g2p-en==2.1.0
- ggl==1.1.0
- google-ai-generativelanguage==0.6.10
- google-api-core==2.24.0
- google-api-python-client==2.155.0
- google-auth==2.37.0
- google-auth-httplib2==0.2.0
- google-generativeai==0.8.3
- googleapis-common-protos==1.66.0
- greenlet==3.1.1
- grpcio==1.68.1
- grpcio-status==1.68.1
- h11==0.14.0
- httpcore==1.0.7
- httplib2==0.22.0
- httptools==0.6.4
- httpx==0.28.1
- huggingface-hub==0.26.3
- humanfriendly==10.0
- hydra-core==1.3.2
- idna==3.10
- importlib-metadata==8.5.0
- importlib-resources==6.4.5
- inflect==7.4.0
- iniconfig==2.0.0
- jaconv==0.4.0
- jamo==0.4.1
- jieba==0.42.1
- jinja2==3.1.3
- jmespath==0.10.0
- joblib==1.4.2
- jsonpatch==1.33
- jsonpointer==3.0.0
- kaldiio==2.18.0
- kiwisolver==1.4.7
- kubernetes==31.0.0
- langchain==0.1.13
- langchain-community==0.0.29
- langchain-core==0.1.53
- langchain-experimental==0.0.55
- langchain-text-splitters==0.0.2
- langsmith==0.1.147
- lazy-loader==0.4
- libcst==1.5.1
- librosa==0.10.2.post1
- lightning==2.4.0
- lightning-utilities==0.11.9
- lion-pytorch==0.2.3
- llvmlite==0.43.0
- local-attention==1.9.3
- loguru==0.7.2
- lxml==5.3.0
- markdown==3.7
- markdown-it-py==3.0.0
- markupsafe==2.1.5
- marshmallow==3.23.1
- matplotlib==3.9.3
- mccabe==0.7.0
- mdurl==0.1.2
- mmh3==5.0.1
- modelscope==1.21.0
- monotonic==1.6
- more-itertools==10.5.0
- mpmath==1.3.0
- msgpack==1.1.0
- multidict==6.1.0
- multiprocess==0.70.16
- mypy-extensions==1.0.0
- nest-asyncio==1.6.0
- networkx==3.2.1
- ninja==1.11.1.2
- nltk==3.9.1
- numba==0.60.0
- numpy==1.25.2
- nvidia-cublas-cu12==12.1.3.1
- nvidia-cuda-cupti-cu12==12.1.105
- nvidia-cuda-nvrtc-cu12==12.1.105
- nvidia-cuda-runtime-cu12==12.1.105
- nvidia-cudnn-cu12==8.9.2.26
- nvidia-cufft-cu12==11.0.2.54
- nvidia-curand-cu12==10.3.2.106
- nvidia-cusolver-cu12==11.4.5.107
- nvidia-cusparse-cu12==12.1.0.106
- nvidia-nccl-cu12==2.19.3
- nvidia-nvjitlink-cu12==12.4.127
- nvidia-nvtx-cu12==12.1.105
- oauthlib==3.2.2
- omegaconf==2.3.0
- onnxruntime==1.19.2
- open-clip-torch==2.29.0
- openai==0.28.0
- opencv-python==4.10.0.84
- opencv-python-headless==4.10.0.84
- opentelemetry-api==1.29.0
- opentelemetry-exporter-otlp-proto-common==1.29.0
- opentelemetry-exporter-otlp-proto-grpc==1.29.0
- opentelemetry-instrumentation==0.50b0
- opentelemetry-instrumentation-asgi==0.50b0
- opentelemetry-instrumentation-fastapi==0.50b0
- opentelemetry-proto==1.29.0
- opentelemetry-sdk==1.29.0
- opentelemetry-semantic-conventions==0.50b0
- opentelemetry-util-http==0.50b0
- orjson==3.10.12
- oss2==2.19.1
- outcome==1.3.0.post0
- overrides==7.7.0
- packaging==23.2
- pandas==2.2.3
- pathspec==0.12.1
- peft==0.13.2
- pillow==10.3.0
- pip==24.3.1
- platformdirs==4.3.6
- playwright==1.49.1
- pluggy==1.5.0
- pooch==1.8.2
- portalocker==3.0.0
- posthog==3.7.4
- primp==0.8.3
- propcache==0.2.1
- proto-plus==1.25.0
- protobuf==5.29.1
- psutil==6.1.0
- pyarrow==18.1.0
- pyarrow-hotfix==0.6
- pyasn1==0.6.1
- pyasn1-modules==0.4.1
- pycodestyle==2.12.1
- pycparser==2.22
- pycryptodome==3.21.0
- pydantic==2.7.1
- pydantic-core==2.18.2
- pydub==0.25.1
- pyee==12.0.0
- pyflakes==3.2.0
- pygments==2.18.0
- pynndescent==0.5.13
- pyparsing==3.2.0
- pypdf==4.1.0
- pypdf2==3.0.1
- pypika==0.48.9
- pyproject-hooks==1.2.0
- pysocks==1.7.1
- pytest==8.1.1
- python-dateutil==2.9.0.post0
- python-dotenv==1.0.1
- pytorch-lightning==2.4.0
- pytorch-wpe==0.0.1
- pytz==2024.2
- pyyaml==6.0.2
- qformer==0.0.5
- ratelimit==2.2.1
- regex==2024.11.6
- requests==2.32.3
- requests-oauthlib==2.0.0
- requests-toolbelt==1.0.0
- rich==13.7.1
- rsa==4.9
- sacrebleu==2.4.3
- safetensors==0.4.5
- scikit-learn==1.5.2
- scipy==1.9.3
- selenium==4.27.1
- sentencepiece==0.2.0
- sentry-sdk==2.19.2
- shellingham==1.5.4
- six==1.17.0
- sniffio==1.3.1
- sortedcontainers==2.4.0
- soundfile==0.12.1
- soupsieve==2.6
- soxr==0.5.0.post1
- sqlalchemy==2.0.36
- starlette==0.41.3
- swarms==2.4.0
- sympy==1.13.1
- tabulate==0.9.0
- tenacity==8.2.3
- tensorboard==2.18.0
- tensorboard-data-server==0.7.2
- tensorboardx==2.6.2.2
- termcolor==2.5.0
- threadpoolctl==3.5.0
- tiktoken==0.8.0
- timm==0.4.12
- tokenizers==0.13.3
- toml==0.10.2
- tomli==2.2.1
- torch==2.2.0
- torch-complex==0.4.4
- torchaudio==2.2.0
- torchfix==0.7.0
- torchmetrics==1.6.0
- torchvision==0.17.0
- tqdm==4.66.2
- transformers==4.33.3
- trio==0.27.0
- trio-websocket==0.11.1
- triton==2.2.0
- typeguard==4.4.1
- typer==0.15.1
- types-requests==2.32.0.20241016
- typing==3.7.4.3
- typing-extensions==4.12.2
- typing-inspect==0.9.0
- tzdata==2024.2
- umap-learn==0.5.7
- undetected-chromedriver==3.5.5
- uritemplate==4.1.1
- urllib3==2.2.3
- uvicorn==0.32.1
- uvloop==0.21.0
- vector-quantize-pytorch==1.14.5
- vocos==0.1.0
- watchfiles==1.0.3
- wcwidth==0.2.13
- websocket-client==1.8.0
- websockets==14.1
- werkzeug==3.1.3
- wget==3.2
- wrapt==1.17.0
- wsproto==1.2.0
- xxhash==3.5.0
- yarl==1.18.3
- zetascale==0.4.9
- zipp==3.21.0
gitextract_tgmekqoo/
├── AutomaticPipeline/
│   ├── AgePreTrainModel.py
│   ├── AutoPipeline.py
│   ├── Clustering.py
│   ├── PitchEnergy.py
│   ├── models/
│   │   └── SECap/
│   │       └── model2.py
│   └── outputs/
│       ├── labels_LJspeech_0.json
│       └── labels_LJspeech_0.scp
├── README.md
├── llama-ft/
│   └── llama_infer.py
└── requirements.yaml
SYMBOL INDEX (50 symbols across 6 files)
FILE: AutomaticPipeline/AgePreTrainModel.py
class ModelHead (line 10) | class ModelHead(nn.Module):
method __init__ (line 13) | def __init__(self, config, num_labels):
method forward (line 21) | def forward(self, features, **kwargs):
class AgeGenderModel (line 32) | class AgeGenderModel(Wav2Vec2PreTrainedModel):
method __init__ (line 35) | def __init__(self, config):
method forward (line 44) | def forward(
FILE: AutomaticPipeline/AutoPipeline.py
function to_device (line 30) | def to_device(tensors, device):
class CustomDataset (line 39) | class CustomDataset(torch.utils.data.Dataset):
method __init__ (line 40) | def __init__(
method __preprocess__ (line 58) | def __preprocess__(self):
method __len__ (line 70) | def __len__(self):
method _cutorpad (line 76) | def _cutorpad(self, audio: np.ndarray) -> np.ndarray:
method __getitem__ (line 94) | def __getitem__(self, index) -> torch.Tensor:
class CollateFunc (line 117) | class CollateFunc:
method __init__ (line 118) | def __init__(
method __call__ (line 132) | def __call__(self, batch: List):
function age_predict (line 157) | def age_predict(batch, model, device):
function gender_predict (line 165) | def gender_predict(batch, model, device):
function emotion_predict (line 176) | def emotion_predict(audiopaths, model):
function pitch_energy_calculate (line 184) | def pitch_energy_calculate(input_values):
function inference_on_device (line 195) | def inference_on_device(device, i, num_devices, language, basedir, scp_p...
function main (line 289) | def main(args):
FILE: AutomaticPipeline/Clustering.py
function assign_pitch_group (line 6) | def assign_pitch_group(row, language, male_percentiles, female_percentil...
function replace_age_with_text (line 22) | def replace_age_with_text(row, language):
function main (line 36) | def main(args):
FILE: AutomaticPipeline/PitchEnergy.py
function extract_pitch (line 4) | def extract_pitch(wav, sr):
function calculate_mean_pitch (line 8) | def calculate_mean_pitch(pitches):
function process_audio (line 11) | def process_audio(audio, sr):
FILE: AutomaticPipeline/models/SECap/model2.py
class KeywordsStoppingCriteria (line 20) | class KeywordsStoppingCriteria(StoppingCriteria):
method __init__ (line 21) | def __init__(self, keywords_ids:list):
method __call__ (line 24) | def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTen...
class MotionAudio (line 29) | class MotionAudio(pl.LightningModule):
method __init__ (line 30) | def __init__(
method init_Qformer (line 76) | def init_Qformer(self,num_query_token, vision_width, cross_attention_f...
method mean_pooling (line 94) | def mean_pooling(self,model_output, attention_mask):
method forward (line 102) | def forward(self, audio, describtion):
method training_step (line 170) | def training_step(self, batch, batch_idx):
method validation_step (line 175) | def validation_step(self, batch, batch_idx):
method configure_optimizers (line 180) | def configure_optimizers(self):
method inference (line 183) | def inference(self, audio_feature):
method post_processing (line 242) | def post_processing(self, sentences,device):
method test_step (line 254) | def test_step(self, batch, batch_idx):
function count_parameters (line 272) | def count_parameters(model):
FILE: llama-ft/llama_infer.py
class Dataset (line 43) | class Dataset(torch.utils.data.Dataset):
method __init__ (line 44) | def __init__(self, args):
method __len__ (line 51) | def __len__(self):
method __getitem__ (line 54) | def __getitem__(self, index):
function extract (line 71) | def extract(input_text, language):
function inference_on_device (line 87) | def inference_on_device(args, tokenizer, device, dataloader):
function main (line 151) | def main(args):
Condensed preview — 10 files, each showing path, character count, and a content snippet. Download the .json file or copy it to get the full structured content (151K chars).
[
{
"path": "AutomaticPipeline/AgePreTrainModel.py",
"chars": 1327,
"preview": "import numpy as np\nimport torch\nimport torch.nn as nn\nfrom transformers.models.wav2vec2.modeling_wav2vec2 import (\n W"
},
{
"path": "AutomaticPipeline/AutoPipeline.py",
"chars": 11761,
"preview": "# coding=utf-8\nimport os\nimport argparse\nimport numpy as np\nimport torch\nimport librosa\nfrom typing import List, Optiona"
},
{
"path": "AutomaticPipeline/Clustering.py",
"chars": 3846,
"preview": "import pandas as pd\nimport csv\nimport numpy as np\nimport argparse\nimport json\ndef assign_pitch_group(row, language, male"
},
{
"path": "AutomaticPipeline/PitchEnergy.py",
"chars": 512,
"preview": "import librosa\nimport numpy as np\n\ndef extract_pitch(wav, sr):\n pitches, magnitudes = librosa.core.piptrack(y=wav, sr"
},
{
"path": "AutomaticPipeline/models/SECap/model2.py",
"chars": 13992,
"preview": "import torch\nimport torch.nn as nn\nimport lightning.pytorch as pl\nfrom module.Qformer import BertConfig, BertLMHeadModel"
},
{
"path": "AutomaticPipeline/outputs/labels_LJspeech_0.json",
"chars": 49493,
"preview": "{\n \"LJSpeech_LJ010-0227\": {\n \"labels\": \"age:Elderly\\tgender:male\\tpitch:low\\tvolume:normal\\tspeed:fast\\temotio"
},
{
"path": "AutomaticPipeline/outputs/labels_LJspeech_0.scp",
"chars": 36520,
"preview": "LJSpeech\tLJ010-0227\t68\tmale\t968.5380249023438\t0.8162265419960022\t0.022741336633663366\tsad\t He went from Newgate first to"
},
{
"path": "README.md",
"chars": 6309,
"preview": "# SpeechCraft\n\nThis is the official repository of the ACM Multimedia 2024 paper *\"SpeechCraft: A Fine-Grained Expressive"
},
{
"path": "llama-ft/llama_infer.py",
"chars": 7269,
"preview": "import os\nimport torch\nimport argparse\nfrom accelerate import Accelerator\nfrom transformers import AutoModelForCausalLM,"
},
{
"path": "requirements.yaml",
"chars": 8578,
"preview": "name: speechcraft\nchannels:\n - http://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/pytorch/\n - http://mirrors.tuna.tsin"
}
]
About this extraction
This page contains the full source code of the thuhcsi/SpeechCraft GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 10 files (136.3 KB), approximately 43.4k tokens, and a symbol index with 50 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.