Repository: thuhcsi/SpeechCraft Branch: master Commit: 3ef02110bd7b Files: 10 Total size: 136.3 KB Directory structure: gitextract_tgmekqoo/ ├── AutomaticPipeline/ │ ├── AgePreTrainModel.py │ ├── AutoPipeline.py │ ├── Clustering.py │ ├── PitchEnergy.py │ ├── models/ │ │ └── SECap/ │ │ └── model2.py │ └── outputs/ │ ├── labels_LJspeech_0.json │ └── labels_LJspeech_0.scp ├── README.md ├── llama-ft/ │ └── llama_infer.py └── requirements.yaml ================================================ FILE CONTENTS ================================================ ================================================ FILE: AutomaticPipeline/AgePreTrainModel.py ================================================ import numpy as np import torch import torch.nn as nn from transformers.models.wav2vec2.modeling_wav2vec2 import ( Wav2Vec2Model, Wav2Vec2PreTrainedModel, ) from torch.nn import functional as F class ModelHead(nn.Module): r"""Classification head.""" def __init__(self, config, num_labels): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) self.dropout = nn.Dropout(config.final_dropout) self.out_proj = nn.Linear(config.hidden_size, num_labels) def forward(self, features, **kwargs): x = features x = self.dropout(x) x = self.dense(x) x = torch.tanh(x) x = self.dropout(x) x = self.out_proj(x) return x class AgeGenderModel(Wav2Vec2PreTrainedModel): r"""Speech emotion classifier.""" def __init__(self, config): super().__init__(config) self.config = config self.wav2vec2 = Wav2Vec2Model(config) self.age = ModelHead(config, 1) self.init_weights() def forward( self, input_values, ): outputs = self.wav2vec2(input_values) hidden_states = outputs[0] hidden_states = torch.mean(hidden_states, dim=1) logits_age = self.age(hidden_states) return logits_age ================================================ FILE: AutomaticPipeline/AutoPipeline.py ================================================ # coding=utf-8 import os import argparse import numpy as np import torch import librosa from typing import List, Optional, Union, Dict from tqdm import tqdm import torchaudio from torch.utils.data import DataLoader from torch.nn import functional as F from transformers import ( AutoModelForAudioClassification, Wav2Vec2Processor, LlamaTokenizer, AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline ) from funasr import AutoModel import sys sys.path.append('../SECap') # path to the directory of SECap from AgePreTrainModel import AgeGenderModel from model2 import MotionAudio from PitchEnergy import process_audio from g2p_en import G2p torch.multiprocessing.set_start_method('spawn', force=True) device = "cuda:0" if torch.cuda.is_available() else "cpu" torch_dtype = torch.float32 def to_device(tensors, device): tensors_to_device = [] for tensor in tensors: if isinstance(tensor, torch.Tensor): tensors_to_device.append(tensor.to(device)) else: tensors_to_device.append(tensor) return tensors_to_device class CustomDataset(torch.utils.data.Dataset): def __init__( self, basedir: Optional[str] = None, sampling_rate: int = 16000, max_audio_len: int = 5, number: int=0, num_devices: int=4 ): self.num_devices = num_devices self.number = number self.basedir = basedir self.sampling_rate = sampling_rate self.max_audio_len = max_audio_len self.dataset = [] self.category = [] self.text_tn = [] self.__preprocess__() def __preprocess__(self): paths = [] for dirname, subdirs, files in os.walk(self.basedir): for filename in files: if filename.startswith('.'): continue if filename.endswith('.wav'): path = os.path.join(dirname, filename) paths.append(path) subset_size = len(paths) // self.num_devices self.dataset = paths[self.number * subset_size: (self.number+1) * subset_size] def __len__(self): """ Return the length of the dataset """ return len(self.dataset) def _cutorpad(self, audio: np.ndarray) -> np.ndarray: """ Cut or pad audio to the wished length """ effective_length = self.sampling_rate * self.max_audio_len len_audio = len(audio) # If audio length is bigger than wished audio length if len_audio > effective_length: audio = audio[:effective_length] elif len_audio < effective_length: audio_feature_tensor = torch.zeros(1, effective_length) audio_feature_tensor[:len(audio)] = audio audio = audio_feature_tensor # Expand one dimension related to the channel dimension return audio def __getitem__(self, index) -> torch.Tensor: """ Return the audio and the sampling rate """ filepath = self.dataset[index] speech_array, sr = librosa.load(filepath) if len(speech_array)==0: return None speech_array = speech_array[:self.max_audio_len * sr] speech_array = torch.Tensor(speech_array).unsqueeze(0) # Transform to mono if speech_array.shape[0] > 1: speech_array = torch.mean(speech_array, dim=0, keepdim=True) if sr != self.sampling_rate: transform = torchaudio.transforms.Resample(sr, self.sampling_rate) speech_array = transform(speech_array) speech_array = speech_array.squeeze().numpy() return speech_array, filepath class CollateFunc: def __init__( self, w2v_processor: Wav2Vec2Processor, max_length: Optional[int] = None, padding: Union[bool, str] = True, pad_to_multiple_of: Optional[int] = None, sampling_rate: int = 16000, ): self.padding = padding self.w2v_processor = w2v_processor self.max_length = max_length self.sampling_rate = sampling_rate self.pad_to_multiple_of = pad_to_multiple_of def __call__(self, batch: List): audios = [] input_features = [] audiopaths = [] durations = [] for audio, audiopath in batch: audios.append(audio) audiopaths.append(audiopath) input_tensor = self.w2v_processor(audio, sampling_rate=self.sampling_rate).input_values input_tensor = np.squeeze(input_tensor) input_features.append({"input_values": input_tensor}) durations.append(len(audio) / self.sampling_rate) batch = self.w2v_processor.pad( input_features, padding=self.padding, max_length=self.max_length, pad_to_multiple_of=self.pad_to_multiple_of, return_tensors="pt", ) return batch, audiopaths, durations def age_predict(batch, model, device): r"""Predict age from raw audio signal.""" audios = batch.to(device) preds = model(audios) preds = preds.detach().cpu().numpy() ages = [int(i*100) for i in preds] return ages def gender_predict(batch, model, device): r"""Predict gender from raw audio signal.""" G = ['female', 'male'] input_values, attention_mask = batch['input_values'].to(device), batch['attention_mask'].to(device) logits = model(input_values, attention_mask=attention_mask).logits scores = F.softmax(logits, dim=-1) pred = torch.argmax(scores, dim=1).cpu().detach().numpy() genders = [G[pred[i]] for i in range(len(pred))] return genders # def emotion_predict(batch, model, device, feature_extractor): def emotion_predict(audiopaths, model): emotionlabels = ['angry', 'disgusted', 'fearful', 'happy', 'neutral', 'neutral', 'sad', 'surprised', 'neutral'] results = model.generate(audiopaths, granularity="utterance", extract_embedding=False) scores = [result['scores'] for result in results] emotion_indexs = [score.index(max(score)) for score in scores] emotions = [emotionlabels[emotion_index] for emotion_index in emotion_indexs] return emotions def pitch_energy_calculate(input_values): r"""Predict pitch and energy from raw audio signal.""" pitchs = [] energys = [] input_values = input_values.cpu().detach().numpy() for audio in input_values: mean_pitch, mean_energy = process_audio(audio, sr=16000) pitchs.append(mean_pitch) energys.append(mean_energy) return pitchs, energys def inference_on_device(device, i, num_devices, language, basedir, scp_path): sampling_rate = 16000 batch_size = 4 gender_model_path = "alefiury/wav2vec2-large-xlsr-53-gender-recognition-librispeech" age_model_path = "audeering/wav2vec2-large-robust-24-ft-age-gender" asr_path = "openai/whisper-medium" # asr_path = "openai/whisper-large-v3" scp_path = scp_path[:-4]+'_'+str(i)+'.scp' # Gender Predict gender_model = AutoModelForAudioClassification.from_pretrained( pretrained_model_name_or_path = gender_model_path, num_labels = 2, label2id = { "female": 0, "male": 1 }, id2label = { 0: "female", 1: "male" }, ) gender_model.to(device) gender_model.eval() # Age Predict w2v_processor = Wav2Vec2Processor.from_pretrained(age_model_path) age_model = AgeGenderModel.from_pretrained(age_model_path, use_auth_token=False) age_model.to(device) #Emotion Predict if language == 'english': emotion_model = AutoModel(model="./models/emotion2vec_base_finetuned", model_revision="v2.0.4") else: emotion_model= MotionAudio() llama_ckpt = "../SECap/weights/models--minlik--chinese-llama-7b-merged/snapshots/1ca4d87576f1fef4d44a949fb65bbe6b96675872" llama_tokenizer = LlamaTokenizer.from_pretrained(llama_ckpt) ckpt_path="../SECap/model.ckpt" torch.cuda.empty_cache() state_dict = torch.load(ckpt_path, map_location=torch.device('cpu')) emotion_model.load_state_dict(state_dict, strict=False) emotion_model.to(device) g2p_model = G2p() # ASR asr_processor = AutoProcessor.from_pretrained(asr_path) asr_model = AutoModelForSpeechSeq2Seq.from_pretrained(asr_path) asr_model.to(device) asr_pipe = pipeline( "automatic-speech-recognition", model=asr_model, tokenizer=asr_processor.tokenizer, feature_extractor=asr_processor.feature_extractor, max_new_tokens=128, chunk_length_s=30, batch_size=batch_size, return_timestamps=False, torch_dtype=torch_dtype, device=device, ) inferset = CustomDataset(basedir, sampling_rate = sampling_rate, number = i, num_devices= num_devices) data_collator = CollateFunc( w2v_processor=w2v_processor, padding=True, sampling_rate=16000, ) test_dataloader = DataLoader( dataset=inferset, batch_size=batch_size, collate_fn=data_collator, shuffle=False, num_workers=0 ) with torch.no_grad(): for s, load_data in enumerate(tqdm(test_dataloader)): audios, audiopaths, durations = to_device(load_data, device) audio_features = audios['input_values'] transcripts = [asr_pipe(audio_features.numpy()[i], return_timestamps=False, generate_kwargs={"language": language})["text"] for i in range(len(audiopaths))] ages = age_predict(audio_features, model=age_model, device=device) genders = gender_predict(audios, model=gender_model, device=device) pitchs, energys = pitch_energy_calculate(audio_features) if language == 'english': phonemes = [g2p_model(transcripts) for i in range(len(audiopaths))] speeds = [durations[i] / len(phonemes[i] ) for i in range(len(audiopaths))] emotions = emotion_predict(audiopaths, model=emotion_model) else: speeds = [durations[i] / len(transcripts[i] ) for i in range(len(audiopaths))] prompts = emotion_model.inference(audio_features.to(device)) emotions = llama_tokenizer.batch_decode(prompts,skip_special_tokens=True) with open(scp_path, 'a', encoding='utf-8') as file: for i in range(len(audiopaths)): file.write(f"{audiopaths[i].split('/')[-2]}\t{audiopaths[i].split('/')[-1][:-4]}\t{ages[i]}\t{genders[i]}\t{pitchs[i]}\t{energys[i]}\t{speeds[i]}\t{emotions[i]}\t{transcripts[i]}\n") def main(args): language = args.language basedir = args.basedir devices = list(map(int, args.devices.split(','))) num_devices = len(devices) scp_path = args.scp_path processes = [] for i in range(num_devices): device_num = devices[i] device = torch.device(f'cuda:{device_num}') p = torch.multiprocessing.Process(target=inference_on_device, args=(device, i, num_devices, language, basedir, scp_path)) p.start() processes.append(p) for p in processes: p.join() if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('--language', type=str, default = 'chinese') parser.add_argument('--devices', type=str, default = '0') parser.add_argument('--basedir', type=str, default = '../FastSpeech2/raw_data/AISHELL3/SSB0005') parser.add_argument('--scp_path', type=str, default = './outputs/labels_AISHELL.scp') args = parser.parse_args() main(args) ================================================ FILE: AutomaticPipeline/Clustering.py ================================================ import pandas as pd import csv import numpy as np import argparse import json def assign_pitch_group(row, language, male_percentiles, female_percentiles): if row['gender'] == 'male': if row['pitch'] <= male_percentiles[0]: return 'low' if language == 'en' else '低' elif row['pitch'] <= male_percentiles[1]: return 'normal' if language == 'en' else '中' else: return 'high' if language == 'en' else '高' else: if row['pitch'] <= female_percentiles[0]: return 'low' if language == 'en' else '低' elif row['pitch'] <= female_percentiles[1]: return 'normal' if language == 'en' else '中' else: return 'high' if language == 'en' else '高' def replace_age_with_text(row, language): age = row['age'] if age < 14: return "Child" if language == 'en' else '小孩' elif age < 26: return "Teenager" if language == 'en' else '少年' elif age < 40: return "Young Adult" if language == 'en' else '青年' elif age < 55: return "Middle-aged" if language == 'en' else '中年' else: return "Elderly" if language == 'en' else '老年' def main(args): input_path = args.input_path language = args.language df = pd.read_csv(input_path, encoding = 'utf-8', sep='\t', header=None, quoting=csv.QUOTE_NONE) df.columns = ['filename1', 'filename2', 'age', 'gender', 'pitch', 'energy', 'speed', 'emotion', 'transcript'] df = df.dropna() # df = df.dropna(subset=['pitch']) male_percentiles = np.percentile(df[df['gender'] == 'male']['pitch'], [40, 90]) female_percentiles = np.percentile(df[df['gender'] == 'female']['pitch'], [10, 60]) df['age'] = df.apply(replace_age_with_text, axis=1, language = language) df['pitch_group'] = df.apply(assign_pitch_group, axis=1, language = language, male_percentiles = male_percentiles, female_percentiles = female_percentiles) df['energy_group'] = pd.qcut(df['energy'], 3, labels=["low", "normal", "high"] if language=='en' else ["低", "中", "高"]) df['speed_group'] = pd.qcut(df['speed'], 3, labels=["fast", "normal", "slow"] if language=='en' else ["快", "中", "慢"]) df_to_save = df[['filename1', 'filename2', 'age', 'gender', 'pitch_group', 'energy_group', 'speed_group', 'emotion', 'transcript']] df_to_save.to_csv(input_path.replace('.scp', '_clusterd.scp'), sep='\t', header=0, index=False, quoting=csv.QUOTE_NONE) result_dict = {} for index, row in df_to_save.iterrows(): key = f"{row['filename1']}_{row['filename2']}" if language == 'en': value = ( f"age:{row['age']}\t" f"gender:{row['gender']}\t" f"pitch:{row['pitch_group']}\t" f"volume:{row['energy_group']}\t" f"speed:{row['speed_group']}\t" f"emotion:{row['emotion']}\t" f"transcription:{row['transcript']}" ) else: value = ( f"语气:{row['emotion']}\t" f"年龄:{row['age']}\t" f"性别:{row['gender']}\t" f"音高:{row['pitch_group']}\t" f"音量:{row['energy_group']}\t" f"语速:{row['speed_group']}" f"文本:{row['transcript']}\t" ) result_dict[key] = {} result_dict[key]['labels'] = value with open(input_path.replace('.scp', '.json'), 'w', encoding='utf-8') as json_file: json.dump(result_dict, json_file, ensure_ascii=False, indent=4) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('--language', type=str, default = 'en') parser.add_argument('--input_path', type=str, default = './outputs/labels_LJspeech_0.scp') args = parser.parse_args() main(args) ================================================ FILE: AutomaticPipeline/PitchEnergy.py ================================================ import librosa import numpy as np def extract_pitch(wav, sr): pitches, magnitudes = librosa.core.piptrack(y=wav, sr=sr) return pitches def calculate_mean_pitch(pitches): return np.mean([np.mean(p[p > 0]) for p in pitches.T if np.sum(p > 0) > 0]) def process_audio(audio, sr): pitches = extract_pitch(audio, sr = sr) mean_pitch = calculate_mean_pitch(pitches) energy = librosa.feature.rms(y=audio) mean_energy = np.mean(energy[~np.isnan(energy)]) return mean_pitch, mean_energy ================================================ FILE: AutomaticPipeline/models/SECap/model2.py ================================================ import torch import torch.nn as nn import lightning.pytorch as pl from module.Qformer import BertConfig, BertLMHeadModel from transformers import ( Wav2Vec2FeatureExtractor, HubertModel, BertTokenizer, BertModel, LlamaTokenizer ) from module.modeling_llama import LlamaForCausalLM from CLUB_modules.mi_estimators import * from tool.get_sentence_simi import SimiCal import torch.nn.functional as F from transformers import StoppingCriteria, StoppingCriteriaList import numpy as np import os class KeywordsStoppingCriteria(StoppingCriteria): def __init__(self, keywords_ids:list): self.keywords = keywords_ids def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: if input_ids[0][-1] in self.keywords: return True return False class MotionAudio(pl.LightningModule): def __init__( self, hubert_ckpt="weights/models--TencentGameMate--chinese-hubert-large/snapshots/90cb660492214f687e60f5ca509b20edae6e75bd", text2vec_ckpt="weights/models--shibing624--text2vec-base-chinese/snapshots/26420fdf61ddfd92fafbaf3bc21a7c06b1812248", llama_ckpt="weights/models--minlik--chinese-llama-7b-merged/snapshots/1ca4d87576f1fef4d44a949fb65bbe6b96675872"): super(MotionAudio,self).__init__() #path current_directory = os.path.dirname(os.path.abspath(__file__)) hubert_ckpt = os.path.join(current_directory, hubert_ckpt) text2vec_ckpt = os.path.join(current_directory, text2vec_ckpt) llama_ckpt = os.path.join(current_directory, llama_ckpt) #hubert self.hubert_model=HubertModel.from_pretrained(hubert_ckpt) self.hubert_feature_extractor=Wav2Vec2FeatureExtractor.from_pretrained(hubert_ckpt) #text2vec self.text2vec_model=BertModel.from_pretrained(text2vec_ckpt) self.text2vec_tokenizer=BertTokenizer.from_pretrained(text2vec_ckpt) #llama self.llama_model=LlamaForCausalLM.from_pretrained(llama_ckpt, torch_dtype="auto") #self.llama_model = self.llama_model.to(torch.float32) self.llama_tokenizer=LlamaTokenizer.from_pretrained(llama_ckpt) if self.llama_tokenizer.pad_token_id is None: self.llama_tokenizer.pad_token = self.llama_tokenizer.unk_token #self.llama_model.model.resize_token_embeddings(len(self.llama_tokenizer)) for p in self.parameters(): p.requires_grad = False #Qformer self.audio_Qformer,self.audio_query_tokens=self.init_Qformer(num_query_token=32,vision_width=768) self.audio_Qformer.cls = None self.audio_Qformer.bert.embeddings.word_embeddings = None self.audio_Qformer.bert.embeddings.position_embeddings = None for layer in self.audio_Qformer.bert.encoder.layer: layer.output = None layer.intermediate = None self.audio_project=nn.Linear(1024,768) self.audio_llama_project=nn.Linear(768,4096) def init_Qformer(self,num_query_token, vision_width, cross_attention_freq=2): path=os.path.dirname(os.path.abspath(__file__)) config_path=os.path.join(path,"weights/models--bert-base-chinese/snapshots/8d2a91f91cc38c96bb8b4556ba70c392f8d5ee55") encoder_config = BertConfig.from_pretrained(config_path) encoder_config.encoder_width = vision_width # insert cross-attention layer every other block encoder_config.add_cross_attention = True encoder_config.cross_attention_freq = cross_attention_freq encoder_config.query_length = num_query_token Qformer = BertLMHeadModel(config=encoder_config) ckpt=os.path.join(path,"weights/models--bert-base-chinese/snapshots/8d2a91f91cc38c96bb8b4556ba70c392f8d5ee55/pytorch_model.bin") Qformer.load_state_dict(torch.load(ckpt),strict=False) query_tokens = nn.Parameter( torch.zeros(1, num_query_token, encoder_config.hidden_size) ) query_tokens.data.normal_(mean=0.0, std=encoder_config.initializer_range) return Qformer, query_tokens def mean_pooling(self,model_output, attention_mask): token_embeddings = model_output[0] # First element of model_output contains all token embeddings input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9) def forward(self, audio, describtion): #hubert with torch.no_grad(): audio_feature=self.hubert_feature_extractor(audio, padding=True,return_tensors="pt",sampling_rate=16000).input_values.to(self.device) audio_feature = audio_feature.half() audio_feature=self.hubert_model(audio_feature).last_hidden_state audio_feature=self.audio_project(audio_feature) #text2vec with torch.no_grad(): #describtion describtion=[s+"" for s in describtion] describtion_input=self.text2vec_tokenizer(describtion, padding=True, truncation=True, return_tensors='pt').to(self.device) describtion_feature=self.text2vec_model(**describtion_input) describtion_feature=self.mean_pooling(describtion_feature,describtion_input['attention_mask']).unsqueeze(1) #Qformer audio_query_tokens=self.audio_query_tokens.expand(audio_feature.shape[0], -1, -1) frame_atts = torch.ones(audio_feature.size()[:-1], dtype=torch.long).to(audio_feature.device) #print(audio_query_tokens.shape,audio_feature.shape,frame_atts.shape) audio_query_output=self.audio_Qformer.bert( query_embeds=audio_query_tokens, #[32,768] encoder_hidden_states=audio_feature, encoder_attention_mask=frame_atts, return_dict=True, ) audio_hidden=audio_query_output.last_hidden_state text_tokens=self.llama_tokenizer(describtion, padding="longest", truncation=True, return_tensors='pt',add_special_tokens=False).to(self.device) #print(audio_hidden.shape) audio_input=self.audio_llama_project(audio_hidden) batchsize=audio_input.shape[0] bos=torch.ones([batchsize, 1],dtype=text_tokens.input_ids.dtype).to(self.device) * self.llama_tokenizer.bos_token_id bos_embeds=self.llama_model.model.embed_tokens(bos.to(self.device)) #in training, we use different prompts for each audio prompts=[ "请用一句话用中文表述音频中说话人的情感状态:", "请用一句中文概括音频中讲话者的情感:", "请用一句中文简述音频里说话者的情感表现:", "请用一句中文概述所给音频中说话人的情感:", "请用一句话用中文描述音频中说话人的情感:", "请用一句中文描绘音频中说话者的情感:", "请用一句中文描述所给音频中说话人的情感:", "请用一句中文简要表述音频中说话人的情感:", "请用一句中文概括所给音频中说话者的情感:", "请用一句话用中文描述所给音频中说话人的情感:", "请用一句中文简述所给音频里说话者的情感:", "请用一句中文描述音频中讲话者的情感:", "请用一句中文概述音频中说话人的情感:", "请用一句话用中文表达音频中说话者的情感:", "请用一句中文简要描述音频中说话人的情感:", "请用一句中文概括音频中说话人的情感:", "请用一句中文描述所给音频中讲话者的情感:", "请用一句中文简述音频中说话者的情感:", "请用一句中文概述所给音频中讲话者的情感:", "请用一句话用中文描述音频中讲话者的情感:", "请用一句中文描述音频中说话人的情感状态:", "请用一句中文概括所给音频里说话者的情感:", "请用一句中文简述所给音频中说话人的情感表现:", "请用一句中文概述音频里说话者的情感:", "请用一句话用中文描述音频中说话人的情感表现:", "请用一句中文描绘所给音频中说话者的情感:", "请用一句中文描述音频里讲话者的情感:", "请用一句中文简要表述所给音频中说话人的情感:", "请用一句中文概括音频里说话者的情感:", "请用一句话用中文描述所给音频中讲话者的情感:" ] import random prompt=prompts[random.randint(0,len(prompts)-1)] prompts_id=self.llama_tokenizer(prompt,return_tensors='pt').input_ids.to(self.device) prompts_id=prompts_id.expand(batchsize,-1) prompts_embeds=self.llama_model.model.embed_tokens(prompts_id) targets=text_tokens.input_ids.masked_fill( text_tokens.input_ids==self.llama_tokenizer.pad_token_id,-100 ) text_embeds=self.llama_model.model.embed_tokens(text_tokens.input_ids.to(self.device)) input_embeds=torch.cat([bos_embeds,audio_input,prompts_embeds,text_embeds],dim=1) atts_audio=torch.ones(audio_input.size()[:-1], dtype=torch.long).to(audio_input.device) #atts_audio=atts_audio.to(self.device) attns_text=text_tokens.attention_mask attns_bos=atts_audio[:,:1] attns_prompt=torch.ones(prompts_embeds.size()[:-1], dtype=torch.long).to(prompts_embeds.device) attns=torch.cat([attns_bos,atts_audio,attns_prompt,attns_text],dim=1) print(input_embeds.shape,attns.shape,targets.shape) outputs=self.llama_model( inputs_embeds=input_embeds, attention_mask=attns, labels=targets, return_dict=True, ) loss=outputs.loss #print(loss) return loss def training_step(self, batch, batch_idx): audio, describtion,_=batch loss=self.forward(audio, describtion) self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True,batch_size=len(audio),sync_dist=True) return loss def validation_step(self, batch, batch_idx): audio, describtion,_=batch loss=self.forward(audio, describtion) self.log('val_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True,batch_size=len(audio),sync_dist=True) return loss def configure_optimizers(self): optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, self.parameters()), lr=0.000013, betas=(0.9, 0.999), eps=1e-08, weight_decay=1e-6) return optimizer def inference(self, audio_feature): with torch.no_grad(): audio_feature=self.hubert_model(audio_feature).last_hidden_state audio_feature=self.audio_project(audio_feature) #Qformer audio_query_tokens=self.audio_query_tokens.expand(audio_feature.shape[0], -1, -1) frame_atts = torch.ones(audio_feature.size()[:-1], dtype=torch.long).to(audio_feature.device) audio_query_output=self.audio_Qformer.bert( query_embeds=audio_query_tokens, #[32,768] encoder_hidden_states=audio_feature, encoder_attention_mask=frame_atts, return_dict=True, ) audio_hidden=audio_query_output.last_hidden_state #print(audio_hidden.shape) audio_input=self.audio_llama_project(audio_hidden) batchsize=audio_input.shape[0] #in inference, we use the same prompt for all audio #prompts=[ "请用一句话用中文表述音频中说话人的情感状态:", "请用一句中文概括音频中讲话者的情感:", "请用一句中文简述音频里说话者的情感表现:", "请用一句中文概述所给音频中说话人的情感:", "请用一句话用中文描述音频中说话人的情感:", "请用一句中文描绘音频中说话者的情感:", "请用一句中文描述所给音频中说话人的情感:", "请用一句中文简要表述音频中说话人的情感:", "请用一句中文概括所给音频中说话者的情感:", "请用一句话用中文描述所给音频中说话人的情感:", "请用一句中文简述所给音频里说话者的情感:", "请用一句中文描述音频中讲话者的情感:", "请用一句中文概述音频中说话人的情感:", "请用一句话用中文表达音频中说话者的情感:", "请用一句中文简要描述音频中说话人的情感:", "请用一句中文概括音频中说话人的情感:", "请用一句中文描述所给音频中讲话者的情感:", "请用一句中文简述音频中说话者的情感:", "请用一句中文概述所给音频中讲话者的情感:", "请用一句话用中文描述音频中讲话者的情感:", "请用一句中文描述音频中说话人的情感状态:", "请用一句中文概括所给音频里说话者的情感:", "请用一句中文简述所给音频中说话人的情感表现:", "请用一句中文概述音频里说话者的情感:", "请用一句话用中文描述音频中说话人的情感表现:", "请用一句中文描绘所给音频中说话者的情感:", "请用一句中文描述音频里讲话者的情感:", "请用一句中文简要表述所给音频中说话人的情感:", "请用一句中文概括音频里说话者的情感:", "请用一句话用中文描述所给音频中讲话者的情感:" ] prompt="请用一句中文简述音频里说话者的情感表现:" #import random #prompt=prompts[random.randint(0,len(prompts)-1)] prompts_id=self.llama_tokenizer(prompt,return_tensors='pt').input_ids.to(self.device) prompts_id=prompts_id.expand(batchsize,-1) prompts_embeds=self.llama_model.model.embed_tokens(prompts_id) bos=torch.ones([batchsize, 1],dtype=torch.int64).to(self.device) * self.llama_tokenizer.bos_token_id bos_embeds=self.llama_model.model.embed_tokens(bos.to(self.device)) embeds=torch.cat([bos_embeds,audio_input,prompts_embeds],dim=1) #print(embeds.dtype) embeds=embeds.half() with torch.no_grad(): outputs=self.llama_model.generate( inputs_embeds=embeds, max_new_tokens=50, min_new_tokens=3, do_sample=True, top_k=10, top_p=0.95, num_beams=5, repetition_penalty=10.0, pad_token_id=self.llama_tokenizer.pad_token_id, eos_token_id=self.llama_tokenizer.eos_token_id, #stopping_criteria=stopping_criteria, early_stopping=True, num_return_sequences=1, no_repeat_ngram_size=2, ) # print(output_tokens) return outputs def post_processing(self, sentences,device): similarities = np.zeros((len(sentences), len(sentences))) simi_cal=SimiCal(device=device) for i in range(len(sentences)): for j in range(len(sentences)): similarities[i, j] = simi_cal(sentences[i], sentences[j]) avg_similarities = np.mean(similarities, axis=1) least_related_indices=avg_similarities.argsort()[:3] remaining_sentences = [sentences[i] for i in range(len(sentences)) if i not in least_related_indices] return remaining_sentences def test_step(self, batch, batch_idx): audio,_,describtion,fpath=batch output_tokens,prompt=self.inference(audio) path=os.path.dirname(os.path.abspath(__file__)) test_file="result/result_1.txt" test_file=os.path.join(path,test_file) with open(test_file,"a",encoding="utf-8") as f: f.write("file: "+fpath[0]+"\n") #f.write("prompt: "+prompt+"\n") f.write("origin: "+describtion[0]+"\n") f.write("result: "+output_tokens[0]+"\n") f.write("result2: "+output_tokens[1]+"\n") f.write("result3: "+output_tokens[2]+"\n") f.write("result4: "+output_tokens[3]+"\n") f.write("result5: "+output_tokens[4]+"\n") f.write("\n") def count_parameters(model): return sum(p.numel() for p in model.parameters() if p.requires_grad) if __name__ == "__main__": model=MotionAudio() print(count_parameters(model)) ================================================ FILE: AutomaticPipeline/outputs/labels_LJspeech_0.json ================================================ { "LJSpeech_LJ010-0227": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:fast\temotion:sad\ttranscription: He went from Newgate first to Bethlehem, from which he was removed to Broadmoor." }, "LJSpeech_LJ050-0106": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:fast\temotion:neutral\ttranscription: to devise a practical system which has any reasonable possibility of revealing" }, "LJSpeech_LJ043-0140": { "labels": "age:Elderly\tgender:male\tpitch:high\tvolume:normal\tspeed:fast\temotion:neutral\ttranscription: He also studied Dallas bus schedules to prepare for his later use of" }, "LJSpeech_LJ018-0032": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:fast\temotion:sad\ttranscription: There was no mystery about his departure. He had gone to Canada by the Victoria" }, "LJSpeech_LJ012-0117": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:fast\temotion:neutral\ttranscription: By and by the occupant of the room noticed something glittering in the center of the fire." }, "LJSpeech_LJ038-0057": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:fast\temotion:neutral\ttranscription: Deputy Sheriff Walther's brought a shotgun into the theater, but laid it on some" }, "LJSpeech_LJ017-0158": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:fast\temotion:happy\ttranscription: She had a little fortune of her own, some 1,700 pounds" }, "LJSpeech_LJ026-0094": { "labels": "age:Middle-aged\tgender:male\tpitch:low\tvolume:normal\tspeed:fast\temotion:neutral\ttranscription: This however is probably not a source of vital energy, but only contributes to the" }, "LJSpeech_LJ045-0152": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:normal\temotion:happy\ttranscription: The events of that evening can best be appreciated through Marina Oswald's testimony." }, "LJSpeech_LJ019-0264": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:normal\temotion:neutral\ttranscription: and it was decidedly of opinion that in all short sentences the hard labor" }, "LJSpeech_LJ040-0047": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:fast\temotion:neutral\ttranscription: That associate did not think that Oswald was a communist." }, "LJSpeech_LJ012-0015": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:normal\temotion:neutral\ttranscription: Weedon and LaCassar to 12 and 6 months respectively in cold baths." }, "LJSpeech_LJ019-0097": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:normal\temotion:sad\ttranscription: The want of uniformity in prison discipline became air long and acknowledged evil." }, "LJSpeech_LJ039-0093": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:normal\temotion:neutral\ttranscription: at a distance of 265.3 feet was, quote," }, "LJSpeech_LJ018-0028": { "labels": "age:Elderly\tgender:female\tpitch:high\tvolume:normal\tspeed:normal\temotion:sad\ttranscription: who had been a lodger of his. Muehler had given the cabman's little daughter" }, "LJSpeech_LJ028-0287": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:normal\temotion:angry\ttranscription: but found none by which he could hope to prevail unless he maimed himself." }, "LJSpeech_LJ024-0117": { "labels": "age:Elderly\tgender:female\tpitch:low\tvolume:normal\tspeed:normal\temotion:neutral\ttranscription: You will find that many of those who pretend to support you will sabotage your plans." }, "LJSpeech_LJ029-0172": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:normal\temotion:neutral\ttranscription: The two Dallas newspapers provided their readers with a steady stream of information and" }, "LJSpeech_LJ024-0044": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:normal\temotion:angry\ttranscription: that I will appoint justices who will act as justices and not as" }, "LJSpeech_LJ045-0246": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:low\tspeed:normal\temotion:sad\ttranscription: He sought for himself a place in history, a role as the great" }, "LJSpeech_LJ042-0191": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:slow\temotion:neutral\ttranscription: While, quote, resourcefulness and patient working towards the aforesaid goal" }, "LJSpeech_LJ050-0099": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:slow\temotion:neutral\ttranscription: It is apparent that a good deal of further consideration and experimentation" }, "LJSpeech_LJ017-0101": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:low\tspeed:slow\temotion:neutral\ttranscription: That day, Palmer had bought more strychnia and had called in a fresh doctor." }, "LJSpeech_LJ016-0388": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:low\tspeed:fast\temotion:happy\ttranscription: A few go further and are almost gluttonous." }, "LJSpeech_LJ007-0129": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:low\tspeed:normal\temotion:angry\ttranscription: the sane and the insane, the young and the old, the trivial offender and the" }, "LJSpeech_LJ014-0195": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:normal\temotion:neutral\ttranscription: Two of them supported Cope, who was still alive, although insensible, and Marley" }, "LJSpeech_LJ027-0056": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:fast\temotion:happy\ttranscription: The relation of skeleton and muscle in arthropods is exactly the reverse." }, "LJSpeech_LJ030-0227": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:normal\temotion:neutral\ttranscription: Special Agent George W. Hickey Jr. in the rear seat of the Presidential" }, "LJSpeech_LJ047-0210": { "labels": "age:Middle-aged\tgender:male\tpitch:low\tvolume:low\tspeed:fast\temotion:neutral\ttranscription: I think all of those, if we had them all together..." }, "LJSpeech_LJ048-0228": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:slow\temotion:neutral\ttranscription: and others who were present say that no agent was inebriated or acted improperly." }, "LJSpeech_LJ009-0104": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:slow\temotion:neutral\ttranscription: The women set up a yell, which is mixed with a rustling noise, occasioned by the" }, "LJSpeech_LJ017-0085": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:slow\temotion:neutral\ttranscription: Palmer's plan was to administer poison in quantities insufficient to" }, "LJSpeech_LJ018-0275": { "labels": "age:Elderly\tgender:male\tpitch:high\tvolume:normal\tspeed:slow\temotion:neutral\ttranscription: On Tarpey's defense, it was stated that the idea of the theft had been suggested" }, "LJSpeech_LJ027-0064": { "labels": "age:Middle-aged\tgender:male\tpitch:low\tvolume:low\tspeed:fast\temotion:neutral\ttranscription: All vertebrates and none other have two cavities." }, "LJSpeech_LJ036-0084": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:slow\temotion:neutral\ttranscription: Craig also claimed that when Fritz pointed out to Oswald that Craig had identified" }, "LJSpeech_LJ040-0211": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:slow\temotion:neutral\ttranscription: This would make him approximately ten, well almost eleven years old." }, "LJSpeech_LJ032-0241": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:slow\temotion:happy\ttranscription: Ten days prior to the Walker attempt, Oswald had undoubtedly received the rifle." }, "LJSpeech_LJ002-0158": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:slow\temotion:neutral\ttranscription: Other cases are recorded elsewhere, as at the Gilsburg Street Comptor, where" }, "LJSpeech_LJ050-0077": { "labels": "age:Elderly\tgender:male\tpitch:high\tvolume:low\tspeed:fast\temotion:neutral\ttranscription: and in taking preventive steps." }, "LJSpeech_LJ032-0236": { "labels": "age:Elderly\tgender:male\tpitch:high\tvolume:high\tspeed:slow\temotion:neutral\ttranscription: By checking the actual mailing dates of these issues and the time it usually takes" }, "LJSpeech_LJ026-0102": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:slow\temotion:happy\ttranscription: but root pressure due to osmosis, capillary action and evaporation." }, "LJSpeech_LJ025-0057": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:slow\temotion:neutral\ttranscription: such as the charai, are in constant and regular motion, was made out" }, "LJSpeech_LJ018-0270": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:slow\temotion:neutral\ttranscription: The assistant called with the jewels on approbation at a house specially hired for" }, "LJSpeech_LJ013-0021": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:low\tspeed:slow\temotion:sad\ttranscription: The larboard pump was suffered to remain choked up and the longboat was" }, "LJSpeech_LJ042-0050": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:normal\temotion:sad\ttranscription: I have been a pro-communist for years and yet I have never met a communist." }, "LJSpeech_LJ006-0075": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:normal\temotion:happy\ttranscription: He saw certain rooms fill up, and yet took no steps to open others that were locked" }, "LJSpeech_LJ048-0187": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:normal\temotion:neutral\ttranscription: In addition, Secret Service agents riding in the motorcade were trained to" }, "LJSpeech_LJ022-0186": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:normal\temotion:happy\ttranscription: Twenty years of experience with this system have justified the efforts made to" }, "LJSpeech_LJ013-0078": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:fast\temotion:neutral\ttranscription: Barbara was subsequently pardoned, but was not replaced on the roles as an attorney." }, "LJSpeech_LJ006-0015": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:fast\temotion:neutral\ttranscription: one which required discretion, judgment and knowledge of law, with sufficient" }, "LJSpeech_LJ048-0061": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:fast\temotion:neutral\ttranscription: Under proper procedures, knowledge of the pending Presidential visit might have prompted" }, "LJSpeech_LJ015-0190": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:fast\temotion:sad\ttranscription: that of dishonest rogues who assume piety and philanthropy as a cloak for the world." }, "LJSpeech_LJ003-0115": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:normal\temotion:neutral\ttranscription: The judge sat in proper form. He was punctiliously styled, my" }, "LJSpeech_LJ045-0244": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:normal\temotion:angry\ttranscription: Long before the assassination, he expressed his hatred for American society." }, "LJSpeech_LJ004-0051": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:normal\temotion:happy\ttranscription: When in Belgium he had examined with great satisfaction the admirable manner" }, "LJSpeech_LJ011-0265": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:normal\temotion:sad\ttranscription: This Mr. Canning had left his widow a life interest in two thousand pounds." }, "LJSpeech_LJ028-0087": { "labels": "age:Middle-aged\tgender:male\tpitch:low\tvolume:low\tspeed:fast\temotion:neutral\ttranscription: Such was the appearance of the builder of the walls of Babylon." }, "LJSpeech_LJ028-0178": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:slow\temotion:happy\ttranscription: The walls of Babylon were so long and wide and high that all who" }, "LJSpeech_LJ011-0113": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:slow\temotion:sad\ttranscription: He met his death with unshaken firmness, only in treating that a certain blue" }, "LJSpeech_LJ039-0162": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:slow\temotion:sad\ttranscription: In an effort to test the rifle under conditions which simulated those which prevailed during" }, "LJSpeech_LJ045-0066": { "labels": "age:Middle-aged\tgender:male\tpitch:normal\tvolume:low\tspeed:fast\temotion:sad\ttranscription: Oswald struck his wife on occasion." }, "LJSpeech_LJ028-0277": { "labels": "age:Middle-aged\tgender:male\tpitch:normal\tvolume:low\tspeed:fast\temotion:surprised\ttranscription: One of his Sumter mules gave birth to a foal." }, "LJSpeech_LJ048-0197": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:slow\temotion:neutral\ttranscription: I then told the officers that their primary duty was traffic and crowd control and that" }, "LJSpeech_LJ019-0170": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:slow\temotion:neutral\ttranscription: More officers were appointed, as the time of so many of those already on the staff" }, "LJSpeech_LJ041-0092": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:slow\temotion:neutral\ttranscription: and quote, individual that you would brainwash and quite easy, but I think" }, "LJSpeech_LJ039-0027": { "labels": "age:Middle-aged\tgender:male\tpitch:low\tvolume:low\tspeed:fast\temotion:happy\ttranscription: and switch with Azure." }, "LJSpeech_LJ030-0077": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:slow\temotion:neutral\ttranscription: Each agent carried a .38 caliber pistol and a shotgun and" }, "LJSpeech_LJ038-0112": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:slow\temotion:neutral\ttranscription: As previously indicated, Marina Oswald testified that she took the" }, "LJSpeech_LJ017-0119": { "labels": "age:Elderly\tgender:male\tpitch:high\tvolume:low\tspeed:fast\temotion:happy\ttranscription: All the circumstances were so suspicious that he could not escape the criminal charge." }, "LJSpeech_LJ007-0242": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:fast\temotion:neutral\ttranscription: No complete and permanent improvement was indeed possible while Newgate remained unchanged." }, "LJSpeech_LJ020-0046": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:low\tspeed:fast\temotion:happy\ttranscription: Close the dough over it, dust your hands and kneading board with flour," }, "LJSpeech_LJ018-0001": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:fast\temotion:neutral\ttranscription: The Chronicles of Newgate, Volume 2, by Arthur Griffiths. Section 21." }, "LJSpeech_LJ026-0098": { "labels": "age:Elderly\tgender:male\tpitch:high\tvolume:normal\tspeed:slow\temotion:neutral\ttranscription: The circulatory system distributes these foods in animals," }, "LJSpeech_LJ021-0152": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:slow\temotion:neutral\ttranscription: and to experiment for a reasonable time with measures suitable to" }, "LJSpeech_LJ047-0223": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:low\tspeed:slow\temotion:neutral\ttranscription: I don't recall the exact date, it was about a week prior.\"" }, "LJSpeech_LJ007-0040": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:low\tspeed:fast\temotion:happy\ttranscription: In a second room were fourteen more who had every hope of a reprieve." }, "LJSpeech_LJ050-0103": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:normal\temotion:sad\ttranscription: were all men who acted alone in their criminal acts against our leaders." }, "LJSpeech_LJ037-0016": { "labels": "age:Elderly\tgender:male\tpitch:high\tvolume:normal\tspeed:normal\temotion:neutral\ttranscription: Scoggins hurriedly left his seat and hid behind the cab, as the man came back to" }, "LJSpeech_LJ028-0336": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:fast\temotion:sad\ttranscription: Darius now, still keeping to the plan agreed upon." }, "LJSpeech_LJ004-0233": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:normal\temotion:neutral\ttranscription: Under the new rule, visitors were not allowed to pass into the interior of the prison," }, "LJSpeech_LJ024-0127": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:slow\temotion:angry\ttranscription: You who know me can have no fear that I would tolerate the destruction" }, "LJSpeech_LJ004-0189": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:slow\temotion:happy\ttranscription: The great principles of classification, cleanliness and employment were closely observed." }, "LJSpeech_LJ005-0045": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:fast\temotion:neutral\ttranscription: Nor did it confine itself to mere verbal recommendations." }, "LJSpeech_LJ005-0158": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:fast\temotion:happy\ttranscription: Bedding and clothing was still denied, but only in a few jails." }, "LJSpeech_LJ011-0018": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:normal\temotion:neutral\ttranscription: The crime, long carried on without detection, was first discovered in 1820." }, "LJSpeech_LJ017-0022": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:low\tspeed:normal\temotion:neutral\ttranscription: was that of Eliza Fenning, who was convicted of an attempt to poison a whole family." }, "LJSpeech_LJ030-0136": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:low\tspeed:normal\temotion:sad\ttranscription: The President replied, that is very obvious." }, "LJSpeech_LJ039-0078": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:normal\temotion:neutral\ttranscription: The effect of a four-power telescopic sight on the difficulty of these shots" }, "LJSpeech_LJ048-0254": { "labels": "age:Elderly\tgender:male\tpitch:high\tvolume:normal\tspeed:normal\temotion:neutral\ttranscription: advised in the course of the Secret Service investigation of these events that each agent" }, "LJSpeech_LJ049-0083": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:normal\temotion:angry\ttranscription: It has long been a federal crime to conspire to injure any federal officer on account of" }, "LJSpeech_LJ006-0043": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:normal\temotion:neutral\ttranscription: The disgraceful overcrowding had been partially ended, but the same evils of" }, "LJSpeech_LJ025-0109": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:normal\temotion:neutral\ttranscription: has been as completely invalidated as the third and second." }, "LJSpeech_LJ044-0163": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:normal\temotion:neutral\ttranscription: Marina Oswald testified that her husband engaged in fair play for Cuba." }, "LJSpeech_LJ003-0037": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:normal\temotion:neutral\ttranscription: A site was purchased between Red Lion and White Cross streets and a new" }, "LJSpeech_LJ013-0117": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:normal\temotion:happy\ttranscription: and with another carry off the plate-chest in broad daylight and as a matter of business." }, "LJSpeech_LJ027-0096": { "labels": "age:Middle-aged\tgender:male\tpitch:low\tvolume:low\tspeed:fast\temotion:neutral\ttranscription: as indeed she must be according to the derivation theory." }, "LJSpeech_LJ021-0035": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:slow\temotion:happy\ttranscription: saved debtors and creditors alike in many other fields of enterprise." }, "LJSpeech_LJ049-0159": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:slow\temotion:neutral\ttranscription: the Secret Service, then the only federal investigative agency, assumed" }, "LJSpeech_LJ039-0100": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:slow\temotion:happy\ttranscription: During the first week of an intensive eight-week training period, he received and" }, "LJSpeech_LJ019-0302": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:low\tspeed:slow\temotion:neutral\ttranscription: In 1862, there were in all 193 jails." }, "LJSpeech_LJ030-0229": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:fast\temotion:neutral\ttranscription: At this point, the cars were speeding through the underpass and had left the scene of the" }, "LJSpeech_LJ005-0151": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:fast\temotion:neutral\ttranscription: In others, women were very properly exempted from it, and also from all severe" }, "LJSpeech_LJ015-0115": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:fast\temotion:sad\ttranscription: Little more remains to be said about Robson. He appears to have accepted his position." }, "LJSpeech_LJ010-0279": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:fast\temotion:neutral\ttranscription: I shall mention briefly one more case, in which, however, there was no murderous" }, "LJSpeech_LJ044-0119": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:fast\temotion:neutral\ttranscription: into which Oswald, in his own words, had quote, thrown himself. He sought it" }, "LJSpeech_LJ049-0115": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:fast\temotion:neutral\ttranscription: of the person who is actually in the exercise of the executive power or" }, "LJSpeech_LJ004-0139": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:fast\temotion:happy\ttranscription: In the morning, the stench and heat were so oppressive that he and everyone else" }, "LJSpeech_LJ007-0223": { "labels": "age:Elderly\tgender:male\tpitch:high\tvolume:normal\tspeed:fast\temotion:neutral\ttranscription: The prison officials appear to be on the side of the inspectors, to the great dissatisfaction" }, "LJSpeech_LJ032-0070": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:normal\temotion:neutral\ttranscription: Ordinarily, Inspector Holmes testified, identification is not requested because" }, "LJSpeech_LJ015-0294": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:normal\temotion:neutral\ttranscription: He forgot to add that it was to be placed to Ralph's credit, and when he called" }, "LJSpeech_LJ014-0083": { "labels": "age:Elderly\tgender:male\tpitch:high\tvolume:normal\tspeed:normal\temotion:neutral\ttranscription: which, having possessed herself of the murdered man's keys, she rifled from" }, "LJSpeech_LJ040-0195": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:normal\temotion:neutral\ttranscription: It appears that he did not want to do any of the things which the authorities suggest" }, "LJSpeech_LJ030-0028": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:normal\temotion:neutral\ttranscription: and the Vice President and Mrs. Johnson were in the receiving line to greet President" }, "LJSpeech_LJ009-0145": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:normal\temotion:neutral\ttranscription: and he seemed to suffer great inward agitation when the ordinary, particularly" }, "LJSpeech_LJ031-0117": { "labels": "age:Middle-aged\tgender:male\tpitch:low\tvolume:low\tspeed:fast\temotion:neutral\ttranscription: assisted by doctors William Osborne and John Parker." }, "LJSpeech_LJ004-0115": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:normal\temotion:happy\ttranscription: Infirmaries separating the sexes were also to be provided. A chapel too." }, "LJSpeech_LJ039-0200": { "labels": "age:Middle-aged\tgender:male\tpitch:low\tvolume:low\tspeed:fast\temotion:neutral\ttranscription: And one of these agents, Robert A. Frazier," }, "LJSpeech_LJ028-0481": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:low\tspeed:slow\temotion:happy\ttranscription: Nabo-Polisar, the father, my begetter, built Imgur-Bel." }, "LJSpeech_LJ003-0245": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:slow\temotion:disgusted\ttranscription: preparatory to their appearance in the Old Bailey. Irons were seldom removed." }, "LJSpeech_LJ017-0261": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:slow\temotion:neutral\ttranscription: Seven were found guilty of murder on the high seas and one, Carlos, a criminal." }, "LJSpeech_LJ007-0137": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:fast\temotion:neutral\ttranscription: More attention to ventilation, which was altogether neglected and inadequate, would" }, "LJSpeech_LJ041-0017": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:fast\temotion:neutral\ttranscription: Wobel also recalled that Oswald once outlined a plan to cut the glass in the" }, "LJSpeech_LJ043-0061": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:fast\temotion:neutral\ttranscription: At the time of his defection, when he evidenced no interest in his father and" }, "LJSpeech_LJ033-0097": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:fast\temotion:neutral\ttranscription: Frazier parked the car in the company parking lot about two blocks north of the depository" }, "LJSpeech_LJ019-0234": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:slow\temotion:sad\ttranscription: The old buildings were entirely disused, and the whole of the inmates of Newgate were" }, "LJSpeech_LJ016-0296": { "labels": "age:Middle-aged\tgender:male\tpitch:high\tvolume:low\tspeed:fast\temotion:neutral\ttranscription: The actual execution made some impression." }, "LJSpeech_LJ023-0072": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:slow\temotion:neutral\ttranscription: Congress passed a statute which, in 1803, the courts" }, "LJSpeech_LJ050-0100": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:slow\temotion:neutral\ttranscription: The Commission recognizes that no set of meaningful criteria will yield" }, "LJSpeech_LJ028-0113": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:fast\temotion:neutral\ttranscription: With mortar and bricks, he built two moat walls about the city." }, "LJSpeech_LJ013-0121": { "labels": "age:Elderly\tgender:male\tpitch:high\tvolume:normal\tspeed:normal\temotion:sad\ttranscription: Howe and his accomplice were arrested. The former was found guilty and sentenced" }, "LJSpeech_LJ014-0273": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:normal\temotion:neutral\ttranscription: The detection of these frauds came while he was still prominently before the world as" }, "LJSpeech_LJ027-0136": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:normal\temotion:neutral\ttranscription: Illustrations quoted from the works of Romains and Locante will make this principle" }, "LJSpeech_LJ003-0067": { "labels": "age:Middle-aged\tgender:male\tpitch:normal\tvolume:low\tspeed:fast\temotion:neutral\ttranscription: in the then existing state of the law," }, "LJSpeech_LJ039-0220": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:normal\temotion:neutral\ttranscription: and that one would not have to be an expert marksman to have accomplished the assassination." }, "LJSpeech_LJ018-0029": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:normal\temotion:neutral\ttranscription: A photograph of Mueller shown the jeweler was identified as the likeness of the jewel." }, "LJSpeech_LJ001-0080": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:normal\temotion:neutral\ttranscription: He seems to have taken the letter of the Elseviers of the 17th century for his mom." }, "LJSpeech_LJ028-0454": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:fast\temotion:neutral\ttranscription: But both sections originally reached the river." }, "LJSpeech_LJ037-0244": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:slow\temotion:neutral\ttranscription: Westbrook identified Commission Exhibit 162 as the light-colored" }, "LJSpeech_LJ008-0174": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:slow\temotion:sad\ttranscription: One cartload of spectators having broken down, some of its occupants fell" }, "LJSpeech_LJ009-0302": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:slow\temotion:sad\ttranscription: Another man was hired, himself a convict, whose fees for self and wife were" }, "LJSpeech_LJ002-0182": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:fast\temotion:neutral\ttranscription: the fleet and the Marshall Sea prisons especially devoted to them." }, "LJSpeech_LJ048-0023": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:slow\temotion:neutral\ttranscription: and he had told us during one of the interviews that he would probably take his wife back to" }, "LJSpeech_LJ019-0257": { "labels": "age:Elderly\tgender:male\tpitch:high\tvolume:low\tspeed:slow\temotion:sad\ttranscription: Here, the tread wheel was in use. There, cellular cranks." }, "LJSpeech_LJ040-0046": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:low\tspeed:slow\temotion:neutral\ttranscription: which one associate described as, quote, irrevocable, end quote." }, "LJSpeech_LJ006-0295": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:slow\temotion:neutral\ttranscription: The governor was also personally responsible for gross contravention of this rule of" }, "LJSpeech_LJ028-0252": { "labels": "age:Middle-aged\tgender:male\tpitch:normal\tvolume:low\tspeed:fast\temotion:neutral\ttranscription: Only the king's son, Belshazzar, was killed." }, "LJSpeech_LJ035-0002": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:low\tspeed:slow\temotion:neutral\ttranscription: Chapter 4 The Assassin Part 4 Oswald's Actions" }, "LJSpeech_LJ009-0189": { "labels": "age:Elderly\tgender:male\tpitch:high\tvolume:high\tspeed:slow\temotion:neutral\ttranscription: Persons were still living in 1855 who had witnessed dissections at Hicks Hall." }, "LJSpeech_LJ011-0032": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:normal\temotion:neutral\ttranscription: declared that they had hitherto formed a high opinion of his honor, integrity, and" }, "LJSpeech_LJ042-0155": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:fast\temotion:neutral\ttranscription: It appears to be the work of a fairly well-organized person." }, "LJSpeech_LJ004-0039": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:low\tspeed:fast\temotion:sad\ttranscription: They were hopeless of any general reform by the action of the executive alone." }, "LJSpeech_LJ025-0157": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:normal\temotion:neutral\ttranscription: under these circumstances, unnatural as they are, with proper management" }, "LJSpeech_LJ030-0121": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:normal\temotion:happy\ttranscription: Several times, Special Agent John D. Reddy came forward from the right front" }, "LJSpeech_LJ010-0135": { "labels": "age:Elderly\tgender:male\tpitch:high\tvolume:low\tspeed:fast\temotion:happy\ttranscription: yelled out three cheers to the populace whom he faced." }, "LJSpeech_LJ050-0118": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:normal\temotion:neutral\ttranscription: Since these agencies are already obliged constantly to evaluate the activities" }, "LJSpeech_LJ040-0175": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:normal\temotion:neutral\ttranscription: through which they could not reach him, but that he preferred the veil to remain intact." }, "LJSpeech_LJ013-0042": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:normal\temotion:neutral\ttranscription: the foundations of which had been laid by buying old ships on purpose to cast the more" }, "LJSpeech_LJ014-0076": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:normal\temotion:happy\ttranscription: He was seen afterwards smoking and talking with his hosts in their back parlor." }, "LJSpeech_LJ008-0097": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:normal\temotion:sad\ttranscription: No sooner was the job finished than half a dozen competitors appeared." }, "LJSpeech_LJ043-0166": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:normal\temotion:neutral\ttranscription: Possibly he might have wanted to be caught and wanted his involvement made clear" }, "LJSpeech_LJ045-0127": { "labels": "age:Elderly\tgender:male\tpitch:high\tvolume:high\tspeed:normal\temotion:neutral\ttranscription: He absolved the Soviet embassy in Mexico City of any blame for his difficulties." }, "LJSpeech_LJ027-0117": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:normal\temotion:happy\ttranscription: Now, here again the former theory appears to be triumphant over the latter." }, "LJSpeech_LJ035-0076": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:normal\temotion:happy\ttranscription: Special Agent John Howlett of the Secret Service carried a rifle from the south" }, "LJSpeech_LJ005-0101": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:fast\temotion:sad\ttranscription: Quince it deduced the practice and condition of every prison that replied." }, "LJSpeech_LJ028-0302": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:slow\temotion:happy\ttranscription: I will desert to the enemy as I am, and when I get into their city," }, "LJSpeech_LJ036-0196": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:slow\temotion:neutral\ttranscription: Tippett patrolled District 78 in the Oak Cliff area of Dixie." }, "LJSpeech_LJ028-0191": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:fast\temotion:happy\ttranscription: The old enemies of Babylon rejoiced." }, "LJSpeech_LJ038-0214": { "labels": "age:Elderly\tgender:male\tpitch:high\tvolume:high\tspeed:slow\temotion:neutral\ttranscription: and the other paragraphs instructed her on the disposal of Oswald's personal effects" }, "LJSpeech_LJ003-0304": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:normal\temotion:neutral\ttranscription: with the penalty of forfeiting the day's allowance of food, an increase of which the committee" }, "LJSpeech_LJ019-0009": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:normal\temotion:neutral\ttranscription: or to insist upon the construction of prisons on the most approved plan." }, "LJSpeech_LJ022-0192": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:fast\temotion:neutral\ttranscription: They contemplate the enrichment of our national life." }, "LJSpeech_LJ007-0026": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:normal\temotion:sad\ttranscription: And with all this, the most dreadful oaths, the worst language, too bad to be repeated." }, "LJSpeech_LJ035-0158": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:normal\temotion:neutral\ttranscription: A blue jacket, later identified by Marina Oswald as her husband's, was" }, "LJSpeech_LJ033-0014": { "labels": "age:Middle-aged\tgender:male\tpitch:low\tvolume:normal\tspeed:normal\temotion:sad\ttranscription: Lee Harvey Oswald lived in a rooming house in Dallas, while his wife and children lived" }, "LJSpeech_LJ018-0369": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:normal\temotion:neutral\ttranscription: the mysterious Bravo case, that of Dr. Lamson, and that of" }, "LJSpeech_LJ017-0028": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:normal\temotion:neutral\ttranscription: that she had had a quarrel with her mistress, and that the latter, with all others," }, "LJSpeech_LJ004-0109": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:fast\temotion:neutral\ttranscription: according to their categories or crimes." }, "LJSpeech_LJ011-0239": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:normal\temotion:neutral\ttranscription: Where robbery with violence was intended, the perpetrators had now to adopt various" }, "LJSpeech_LJ049-0034": { "labels": "age:Elderly\tgender:male\tpitch:high\tvolume:normal\tspeed:normal\temotion:surprised\ttranscription: could have reached the President in time to protect him from the second and fatal shot" }, "LJSpeech_LJ016-0389": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:normal\temotion:neutral\ttranscription: Giovanni Lanni, the Italian boy who murdered a French woman in the hay market." }, "LJSpeech_LJ037-0018": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:slow\temotion:sad\ttranscription: Skaggins saw him and heard him mutter either, Poor damn cop or..." }, "LJSpeech_LJ041-0163": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:slow\temotion:neutral\ttranscription: Out of a combination of Oswald's known Marxist sympathies and George Orwell's" }, "LJSpeech_LJ017-0178": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:slow\temotion:surprised\ttranscription: It appeared that several persons with whom she was intimate had succumbed suddenly." }, "LJSpeech_LJ048-0006": { "labels": "age:Middle-aged\tgender:male\tpitch:low\tvolume:low\tspeed:fast\temotion:neutral\ttranscription: or to the Vice President.\"" }, "LJSpeech_LJ002-0184": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:slow\temotion:neutral\ttranscription: the latter two being also a prison for felons and vagrants arrested within certain" }, "LJSpeech_LJ012-0143": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:slow\temotion:neutral\ttranscription: Money Moses had received the stolen gold dust from Moss's father-in-law, Davis." }, "LJSpeech_LJ023-0012": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:slow\temotion:neutral\ttranscription: But when almost two years later it came before the Supreme" }, "LJSpeech_LJ036-0068": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:low\tspeed:fast\temotion:sad\ttranscription: Both buses stopped within one block of the depository building." }, "LJSpeech_LJ046-0158": { "labels": "age:Elderly\tgender:male\tpitch:high\tvolume:high\tspeed:fast\temotion:neutral\ttranscription: At the time of the assassination, the active PRS general files contained" }, "LJSpeech_LJ011-0287": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:fast\temotion:neutral\ttranscription: but at length managed to wriggle out of the chain which confined his body and" }, "LJSpeech_LJ016-0399": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:fast\temotion:neutral\ttranscription: Wainwright was allowed a cigar the night before execution, which he smoked in the prison yard." }, "LJSpeech_LJ006-0012": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:fast\temotion:neutral\ttranscription: but he would not arm them with any authority lest their cooperation might be offensive." }, "LJSpeech_LJ005-0102": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:fast\temotion:neutral\ttranscription: Upon these and the private visitations made by various members, the Society" }, "LJSpeech_LJ001-0169": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:fast\temotion:neutral\ttranscription: The paper used for printing the small, highly ornamented French service books." }, "LJSpeech_LJ011-0271": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:fast\temotion:neutral\ttranscription: Mr. G went in the coach sent for him and alighted at 27 York Street," }, "LJSpeech_LJ014-0057": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:fast\temotion:neutral\ttranscription: London did not escape the contagion and, prominent among the detestable crimes of" }, "LJSpeech_LJ008-0191": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:slow\temotion:disgusted\ttranscription: At Courvoisier's execution in 1840, it was the same, or worse." }, "LJSpeech_LJ007-0120": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:fast\temotion:sad\ttranscription: some to the infirmary, many more to the governor's house." }, "LJSpeech_LJ012-0268": { "labels": "age:Elderly\tgender:male\tpitch:high\tvolume:low\tspeed:normal\temotion:neutral\ttranscription: Suspicion grew almost to certainty as the evidence was unfolded." }, "LJSpeech_LJ015-0298": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:slow\temotion:happy\ttranscription: But while Hardwick was in communication with Sawward, the bank was in communication with" }, "LJSpeech_LJ030-0213": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:slow\temotion:neutral\ttranscription: Hill heard a second shot, approximately five seconds after the first, which removed" }, "LJSpeech_LJ009-0121": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:normal\temotion:happy\ttranscription: for all thy goodness and loving kindness to us and to all men." }, "LJSpeech_LJ037-0084": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:slow\temotion:neutral\ttranscription: Barbara Jeanette Davis testified that no one had shown her a picture of Oswald before." }, "LJSpeech_LJ009-0219": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:fast\temotion:happy\ttranscription: A clause was inserted to the effect that" }, "LJSpeech_LJ040-0143": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:normal\temotion:neutral\ttranscription: Contrary to reports that appeared after the assassination, the psychiatric examiner" }, "LJSpeech_LJ006-0048": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:normal\temotion:neutral\ttranscription: To these were still added an average of about 50, expecting the last penalty." }, "LJSpeech_LJ033-0006": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:fast\temotion:neutral\ttranscription: In this connection, the Commission considered one" }, "LJSpeech_LJ044-0207": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:high\tspeed:normal\temotion:neutral\ttranscription: the economic embargo against that country, and the general policy of the United States." }, "LJSpeech_LJ032-0260": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:normal\tspeed:normal\temotion:neutral\ttranscription: He thought it contained tent poles or possibly other camping equipment, such as" }, "LJSpeech_LJ017-0030": { "labels": "age:Elderly\tgender:male\tpitch:high\tvolume:high\tspeed:normal\temotion:happy\ttranscription: When the spread of scientific knowledge places nefarious means at the disposal of" }, "LJSpeech_LJ005-0084": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:normal\temotion:neutral\ttranscription: so as to prevent them from seeing, conversing, or holding any interaction." }, "LJSpeech_LJ008-0236": { "labels": "age:Middle-aged\tgender:male\tpitch:low\tvolume:low\tspeed:fast\temotion:neutral\ttranscription: Coiled up on the floor of the scaffold like a serpent, the hangman's rope." }, "LJSpeech_LJ027-0020": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:slow\temotion:happy\ttranscription: The unity in life, then, is not less a fact than is life's great diversity." }, "LJSpeech_LJ017-0238": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:low\tspeed:slow\temotion:fearful\ttranscription: Tefer the second mate agreed, but constantly went in fear of his life." }, "LJSpeech_LJ045-0233": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:normal\tspeed:slow\temotion:angry\ttranscription: He consistently refused to admit involvement in the assassination" }, "LJSpeech_LJ038-0099": { "labels": "age:Middle-aged\tgender:male\tpitch:low\tvolume:low\tspeed:fast\temotion:neutral\ttranscription: From the outset, Oswald denied owning a rifle." }, "LJSpeech_LJ028-0231": { "labels": "age:Elderly\tgender:male\tpitch:low\tvolume:low\tspeed:fast\temotion:neutral\ttranscription: whereupon they withdrew within their defenses." }, "LJSpeech_LJ044-0239": { "labels": "age:Elderly\tgender:male\tpitch:high\tvolume:high\tspeed:slow\temotion:neutral\ttranscription: and raises serious questions as to whether or not he ever expected to" }, "LJSpeech_LJ050-0086": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:slow\temotion:neutral\ttranscription: Under these criteria, whether the case should be referred to the Secret Service depends" }, "LJSpeech_LJ047-0021": { "labels": "age:Elderly\tgender:male\tpitch:normal\tvolume:high\tspeed:slow\temotion:neutral\ttranscription: information regarding his relations with the U.S. Embassy in Moscow and background" } } ================================================ FILE: AutomaticPipeline/outputs/labels_LJspeech_0.scp ================================================ LJSpeech LJ010-0227 68 male 968.5380249023438 0.8162265419960022 0.022741336633663366 sad He went from Newgate first to Bethlehem, from which he was removed to Broadmoor. LJSpeech LJ050-0106 59 male 906.479736328125 0.8606864213943481 0.022741336633663366 neutral to devise a practical system which has any reasonable possibility of revealing LJSpeech LJ043-0140 57 male 1299.147705078125 0.8262879848480225 0.022741336633663366 neutral He also studied Dallas bus schedules to prepare for his later use of LJSpeech LJ018-0032 66 male 1098.609619140625 0.763823390007019 0.022741336633663366 sad There was no mystery about his departure. He had gone to Canada by the Victoria LJSpeech LJ012-0117 62 male 988.9229736328125 0.8686652183532715 0.022741336633663366 neutral By and by the occupant of the room noticed something glittering in the center of the fire. LJSpeech LJ038-0057 65 male 1146.2008056640625 0.838049590587616 0.022741336633663366 neutral Deputy Sheriff Walther's brought a shotgun into the theater, but laid it on some LJSpeech LJ017-0158 62 male 951.9786376953125 0.8371432423591614 0.022741336633663366 happy She had a little fortune of her own, some 1,700 pounds LJSpeech LJ026-0094 53 male 1013.83544921875 0.8242807984352112 0.022741336633663366 neutral This however is probably not a source of vital energy, but only contributes to the LJSpeech LJ045-0152 65 male 928.56396484375 0.8731129169464111 0.024118640350877192 happy The events of that evening can best be appreciated through Marina Oswald's testimony. LJSpeech LJ019-0264 67 male 1011.5900268554688 0.874998152256012 0.024177631578947367 neutral and it was decidedly of opinion that in all short sentences the hard labor LJSpeech LJ040-0047 63 male 1101.93212890625 0.5375705361366272 0.016539692982456142 neutral That associate did not think that Oswald was a communist. LJSpeech LJ012-0015 63 male 1143.9190673828125 0.8230547904968262 0.024177631578947367 neutral Weedon and LaCassar to 12 and 6 months respectively in cold baths. LJSpeech LJ019-0097 63 male 976.0210571289062 0.868423581123352 0.024009146341463415 sad The want of uniformity in prison discipline became air long and acknowledged evil. LJSpeech LJ039-0093 65 male 1167.13720703125 0.7914684414863586 0.024009146341463415 neutral at a distance of 265.3 feet was, quote, LJSpeech LJ018-0028 63 female 1036.7177734375 0.8202313780784607 0.024009146341463415 sad who had been a lodger of his. Muehler had given the cabman's little daughter LJSpeech LJ028-0287 66 male 1013.0613403320312 0.7936339974403381 0.024009146341463415 angry but found none by which he could hope to prevail unless he maimed himself. LJSpeech LJ024-0117 65 female 746.5477294921875 0.8057168126106262 0.024093094405594404 neutral You will find that many of those who pretend to support you will sabotage your plans. LJSpeech LJ029-0172 64 male 1144.1251220703125 0.8575349450111389 0.024093094405594404 neutral The two Dallas newspapers provided their readers with a steady stream of information and LJSpeech LJ024-0044 59 male 1082.681884765625 0.8042803406715393 0.024093094405594404 angry that I will appoint justices who will act as justices and not as LJSpeech LJ045-0246 60 male 916.866943359375 0.77986079454422 0.024093094405594404 sad He sought for himself a place in history, a role as the great LJSpeech LJ042-0191 59 male 1080.6220703125 0.7938746213912964 0.026502403846153846 neutral While, quote, resourcefulness and patient working towards the aforesaid goal LJSpeech LJ050-0099 66 male 1119.674072265625 0.8032099604606628 0.026502403846153846 neutral It is apparent that a good deal of further consideration and experimentation LJSpeech LJ017-0101 63 male 889.54541015625 0.7770697474479675 0.026502403846153846 neutral That day, Palmer had bought more strychnia and had called in a fresh doctor. LJSpeech LJ016-0388 56 male 941.869384765625 0.5061151385307312 0.01628389423076923 happy A few go further and are almost gluttonous. LJSpeech LJ007-0129 66 male 657.7396850585938 0.7841930985450745 0.022816639072847682 angry the sane and the insane, the young and the old, the trivial offender and the LJSpeech LJ014-0195 62 male 899.5761108398438 0.8505150675773621 0.022816639072847682 neutral Two of them supported Cope, who was still alive, although insensible, and Marley LJSpeech LJ027-0056 60 male 1063.888671875 0.8467499017715454 0.02244308774834437 happy The relation of skeleton and muscle in arthropods is exactly the reverse. LJSpeech LJ030-0227 61 male 1070.8468017578125 0.8109928965568542 0.022816639072847682 neutral Special Agent George W. Hickey Jr. in the rear seat of the Presidential LJSpeech LJ047-0210 53 male 734.2883911132812 0.4516647458076477 0.013350046641791045 neutral I think all of those, if we had them all together... LJSpeech LJ048-0228 69 male 914.8754272460938 0.8221648335456848 0.025711287313432835 neutral and others who were present say that no agent was inebriated or acted improperly. LJSpeech LJ009-0104 63 male 910.867919921875 0.8145198225975037 0.025711287313432835 neutral The women set up a yell, which is mixed with a rustling noise, occasioned by the LJSpeech LJ017-0085 61 male 1220.1307373046875 0.800680935382843 0.025711287313432835 neutral Palmer's plan was to administer poison in quantities insufficient to LJSpeech LJ018-0275 66 male 1315.436279296875 0.8228096961975098 0.025520833333333333 neutral On Tarpey's defense, it was stated that the idea of the theft had been suggested LJSpeech LJ027-0064 54 male 993.9934692382812 0.5912888646125793 0.019532638888888888 neutral All vertebrates and none other have two cavities. LJSpeech LJ036-0084 59 male 1092.259521484375 0.8476306796073914 0.025520833333333333 neutral Craig also claimed that when Fritz pointed out to Oswald that Craig had identified LJSpeech LJ040-0211 64 male 897.1492919921875 0.7998844981193542 0.025520833333333333 neutral This would make him approximately ten, well almost eleven years old. LJSpeech LJ032-0241 59 male 979.5408935546875 0.8054627180099487 0.02630009541984733 happy Ten days prior to the Walker attempt, Oswald had undoubtedly received the rifle. LJSpeech LJ002-0158 61 male 1082.5684814453125 0.8261975646018982 0.02630009541984733 neutral Other cases are recorded elsewhere, as at the Gilsburg Street Comptor, where LJSpeech LJ050-0077 58 male 1372.689453125 0.3803936243057251 0.011823711832061068 neutral and in taking preventive steps. LJSpeech LJ032-0236 64 male 1258.6331787109375 0.8708815574645996 0.02630009541984733 neutral By checking the actual mailing dates of these issues and the time it usually takes LJSpeech LJ026-0102 58 male 1112.703857421875 0.7878386974334717 0.02505681818181818 happy but root pressure due to osmosis, capillary action and evaporation. LJSpeech LJ025-0057 63 male 1100.768798828125 0.8493995666503906 0.02505681818181818 neutral such as the charai, are in constant and regular motion, was made out LJSpeech LJ018-0270 61 male 1052.5274658203125 0.8861708641052246 0.02505681818181818 neutral The assistant called with the jewels on approbation at a house specially hired for LJSpeech LJ013-0021 65 male 896.8339233398438 0.7397869229316711 0.02505681818181818 sad The larboard pump was suffered to remain choked up and the longboat was LJSpeech LJ042-0050 62 male 895.3709716796875 0.8398865461349487 0.023122902684563757 sad I have been a pro-communist for years and yet I have never met a communist. LJSpeech LJ006-0075 65 male 1124.275634765625 0.8591945767402649 0.023122902684563757 happy He saw certain rooms fill up, and yet took no steps to open others that were locked LJSpeech LJ048-0187 62 male 1194.6612548828125 0.8206942677497864 0.023122902684563757 neutral In addition, Secret Service agents riding in the motorcade were trained to LJSpeech LJ022-0186 59 male 1200.8306884765625 0.8014236688613892 0.023122902684563757 happy Twenty years of experience with this system have justified the efforts made to LJSpeech LJ013-0078 67 male 1060.476318359375 0.8209429979324341 0.02173698738170347 neutral Barbara was subsequently pardoned, but was not replaced on the roles as an attorney. LJSpeech LJ006-0015 62 male 1111.1729736328125 0.8129136562347412 0.02173698738170347 neutral one which required discretion, judgment and knowledge of law, with sufficient LJSpeech LJ048-0061 64 male 1116.922119140625 0.8459006547927856 0.02173698738170347 neutral Under proper procedures, knowledge of the pending Presidential visit might have prompted LJSpeech LJ015-0190 63 male 1128.6842041015625 0.7676864862442017 0.02173698738170347 sad that of dishonest rogues who assume piety and philanthropy as a cloak for the world. LJSpeech LJ003-0115 62 male 1147.26904296875 0.832510232925415 0.023842993079584776 neutral The judge sat in proper form. He was punctiliously styled, my LJSpeech LJ045-0244 67 male 1213.853271484375 0.797639787197113 0.023842993079584776 angry Long before the assassination, he expressed his hatred for American society. LJSpeech LJ004-0051 66 male 850.4898681640625 0.8350055813789368 0.023842993079584776 happy When in Belgium he had examined with great satisfaction the admirable manner LJSpeech LJ011-0265 62 male 1222.8548583984375 0.8727220892906189 0.023842993079584776 sad This Mr. Canning had left his widow a life interest in two thousand pounds. LJSpeech LJ028-0087 53 male 983.9290161132812 0.5809707641601562 0.01661304151624549 neutral Such was the appearance of the builder of the walls of Babylon. LJSpeech LJ028-0178 65 male 945.8756713867188 0.8222951292991638 0.024875902527075812 happy The walls of Babylon were so long and wide and high that all who LJSpeech LJ011-0113 64 male 1133.138427734375 0.8332394957542419 0.024875902527075812 sad He met his death with unshaken firmness, only in treating that a certain blue LJSpeech LJ039-0162 61 male 1100.3876953125 0.8697624802589417 0.024875902527075812 sad In an effort to test the rifle under conditions which simulated those which prevailed during LJSpeech LJ045-0066 53 male 1083.7061767578125 0.3979305326938629 0.014269723360655738 sad Oswald struck his wife on occasion. LJSpeech LJ028-0277 50 male 1167.56591796875 0.5111944675445557 0.017220543032786886 surprised One of his Sumter mules gave birth to a foal. LJSpeech LJ048-0197 64 male 992.088134765625 0.8570470213890076 0.028240266393442622 neutral I then told the officers that their primary duty was traffic and crowd control and that LJSpeech LJ019-0170 66 male 1029.4417724609375 0.8437999486923218 0.028240266393442622 neutral More officers were appointed, as the time of so many of those already on the staff LJSpeech LJ041-0092 59 male 839.0108642578125 0.7990822792053223 0.030625 neutral and quote, individual that you would brainwash and quite easy, but I think LJSpeech LJ039-0027 52 male 838.1315307617188 0.2564748227596283 0.00900361111111111 happy and switch with Azure. LJSpeech LJ030-0077 64 male 1158.3037109375 0.809492826461792 0.030625 neutral Each agent carried a .38 caliber pistol and a shotgun and LJSpeech LJ038-0112 64 male 1158.54931640625 0.7811505198478699 0.030625 neutral As previously indicated, Marina Oswald testified that she took the LJSpeech LJ017-0119 62 male 1471.981689453125 0.7784394025802612 0.022372159090909092 happy All the circumstances were so suspicious that he could not escape the criminal charge. LJSpeech LJ007-0242 63 male 898.47412109375 0.860797107219696 0.022161728896103895 neutral No complete and permanent improvement was indeed possible while Newgate remained unchanged. LJSpeech LJ020-0046 66 male 936.8101806640625 0.7930992841720581 0.022372159090909092 happy Close the dough over it, dust your hands and kneading board with flour, LJSpeech LJ018-0001 68 male 1067.754638671875 0.8335044980049133 0.022372159090909092 neutral The Chronicles of Newgate, Volume 2, by Arthur Griffiths. Section 21. LJSpeech LJ026-0098 57 male 1459.13232421875 0.8028250336647034 0.02745268924302789 neutral The circulatory system distributes these foods in animals, LJSpeech LJ021-0152 61 male 1135.7069091796875 0.7745066285133362 0.02745268924302789 neutral and to experiment for a reasonable time with measures suitable to LJSpeech LJ047-0223 62 male 809.3446655273438 0.6974994540214539 0.02534586653386454 neutral I don't recall the exact date, it was about a week prior." LJSpeech LJ007-0040 61 male 817.2888793945312 0.6683530211448669 0.020692480079681275 happy In a second room were fourteen more who had every hope of a reprieve. LJSpeech LJ050-0103 63 male 908.8638305664062 0.795621395111084 0.02478642086330935 sad were all men who acted alone in their criminal acts against our leaders. LJSpeech LJ037-0016 62 male 1227.5238037109375 0.8029683232307434 0.02478642086330935 neutral Scoggins hurriedly left his seat and hid behind the cab, as the man came back to LJSpeech LJ028-0336 59 male 1117.00927734375 0.6433566808700562 0.018222347122302158 sad Darius now, still keeping to the plan agreed upon. LJSpeech LJ004-0233 63 male 873.744384765625 0.8720524311065674 0.02478642086330935 neutral Under the new rule, visitors were not allowed to pass into the interior of the prison, LJSpeech LJ024-0127 66 male 790.1196899414062 0.8227817416191101 0.025333180147058824 angry You who know me can have no fear that I would tolerate the destruction LJSpeech LJ004-0189 67 male 882.4647216796875 0.8654625415802002 0.025333180147058824 happy The great principles of classification, cleanliness and employment were closely observed. LJSpeech LJ005-0045 60 male 1204.24072265625 0.5985238552093506 0.017624310661764703 neutral Nor did it confine itself to mere verbal recommendations. LJSpeech LJ005-0158 63 male 1125.4774169921875 0.658680260181427 0.021271369485294116 happy Bedding and clothing was still denied, but only in a few jails. LJSpeech LJ011-0018 65 male 1102.3902587890625 0.8398444652557373 0.024434840425531915 neutral The crime, long carried on without detection, was first discovered in 1820. LJSpeech LJ017-0022 63 male 950.7742919921875 0.7860767841339111 0.024434840425531915 neutral was that of Eliza Fenning, who was convicted of an attempt to poison a whole family. LJSpeech LJ030-0136 58 male 961.9500122070312 0.7000767588615417 0.024434840425531915 sad The President replied, that is very obvious. LJSpeech LJ039-0078 59 male 994.359375 0.811722457408905 0.024434840425531915 neutral The effect of a four-power telescopic sight on the difficulty of these shots LJSpeech LJ048-0254 63 male 1418.6072998046875 0.8071127533912659 0.023045568561872908 neutral advised in the course of the Secret Service investigation of these events that each agent LJSpeech LJ049-0083 63 male 1159.44287109375 0.8841239213943481 0.023045568561872908 angry It has long been a federal crime to conspire to injure any federal officer on account of LJSpeech LJ006-0043 61 male 963.5223999023438 0.8555874228477478 0.023045568561872908 neutral The disgraceful overcrowding had been partially ended, but the same evils of LJSpeech LJ025-0109 62 male 1129.241943359375 0.8006194829940796 0.023045568561872908 neutral has been as completely invalidated as the third and second. LJSpeech LJ044-0163 62 male 957.9028930664062 0.8367945551872253 0.02478642086330935 neutral Marina Oswald testified that her husband engaged in fair play for Cuba. LJSpeech LJ003-0037 65 male 1216.96533203125 0.8291783332824707 0.02478642086330935 neutral A site was purchased between Red Lion and White Cross streets and a new LJSpeech LJ013-0117 61 male 1048.94189453125 0.8147414922714233 0.02478642086330935 happy and with another carry off the plate-chest in broad daylight and as a matter of business. LJSpeech LJ027-0096 52 male 987.2114868164062 0.5842734575271606 0.01701371402877698 neutral as indeed she must be according to the derivation theory. LJSpeech LJ021-0035 61 male 1195.374755859375 0.8064736127853394 0.024875902527075812 happy saved debtors and creditors alike in many other fields of enterprise. LJSpeech LJ049-0159 67 male 1158.23046875 0.8147270679473877 0.024875902527075812 neutral the Secret Service, then the only federal investigative agency, assumed LJSpeech LJ039-0100 60 male 1073.9599609375 0.7953296303749084 0.024875902527075812 happy During the first week of an intensive eight-week training period, he received and LJSpeech LJ019-0302 66 male 927.6077270507812 0.7891265153884888 0.024875902527075812 neutral In 1862, there were in all 193 jails. LJSpeech LJ030-0229 59 male 1128.42724609375 0.8108355402946472 0.0216686320754717 neutral At this point, the cars were speeding through the underpass and had left the scene of the LJSpeech LJ005-0151 55 male 1023.0142211914062 0.8571876287460327 0.0216686320754717 neutral In others, women were very properly exempted from it, and also from all severe LJSpeech LJ015-0115 69 male 1087.5670166015625 0.8020297884941101 0.0216686320754717 sad Little more remains to be said about Robson. He appears to have accepted his position. LJSpeech LJ010-0279 66 male 988.0106201171875 0.8697272539138794 0.0216686320754717 neutral I shall mention briefly one more case, in which, however, there was no murderous LJSpeech LJ044-0119 62 male 947.2424926757812 0.8160279393196106 0.022372159090909092 neutral into which Oswald, in his own words, had quote, thrown himself. He sought it LJSpeech LJ049-0115 65 male 1047.099853515625 0.8133270740509033 0.022372159090909092 neutral of the person who is actually in the exercise of the executive power or LJSpeech LJ004-0139 64 male 965.0425415039062 0.8386383652687073 0.022372159090909092 happy In the morning, the stench and heat were so oppressive that he and everyone else LJSpeech LJ007-0223 61 male 1323.4381103515625 0.8278164267539978 0.022372159090909092 neutral The prison officials appear to be on the side of the inspectors, to the great dissatisfaction LJSpeech LJ032-0070 67 male 1133.4202880859375 0.764536440372467 0.023122902684563757 neutral Ordinarily, Inspector Holmes testified, identification is not requested because LJSpeech LJ015-0294 64 male 1171.9451904296875 0.816940426826477 0.023122902684563757 neutral He forgot to add that it was to be placed to Ralph's credit, and when he called LJSpeech LJ014-0083 60 male 1259.9434814453125 0.8244383335113525 0.023122902684563757 neutral which, having possessed herself of the murdered man's keys, she rifled from LJSpeech LJ040-0195 67 male 1063.7657470703125 0.8319800496101379 0.023122902684563757 neutral It appears that he did not want to do any of the things which the authorities suggest LJSpeech LJ030-0028 57 male 1052.3692626953125 0.8686795830726624 0.024093094405594404 neutral and the Vice President and Mrs. Johnson were in the receiving line to greet President LJSpeech LJ009-0145 63 male 1096.4228515625 0.8129713535308838 0.024093094405594404 neutral and he seemed to suffer great inward agitation when the ordinary, particularly LJSpeech LJ031-0117 50 male 923.3846435546875 0.5800350308418274 0.01670563811188811 neutral assisted by doctors William Osborne and John Parker. LJSpeech LJ004-0115 64 male 937.3268432617188 0.8438901305198669 0.024093094405594404 happy Infirmaries separating the sexes were also to be provided. A chapel too. LJSpeech LJ039-0200 50 male 1002.3143310546875 0.47214874625205994 0.014967765748031497 neutral And one of these agents, Robert A. Frazier, LJSpeech LJ028-0481 59 male 761.4593505859375 0.7362769842147827 0.027128444881889764 happy Nabo-Polisar, the father, my begetter, built Imgur-Bel. LJSpeech LJ003-0245 68 male 855.0972290039062 0.8284653425216675 0.027128444881889764 disgusted preparatory to their appearance in the Old Bailey. Irons were seldom removed. LJSpeech LJ017-0261 55 male 1154.7520751953125 0.7629727721214294 0.027128444881889764 neutral Seven were found guilty of murder on the high seas and one, Carlos, a criminal. LJSpeech LJ007-0137 62 male 1012.0001220703125 0.8231872320175171 0.021805775316455698 neutral More attention to ventilation, which was altogether neglected and inadequate, would LJSpeech LJ041-0017 57 male 845.4912719726562 0.8081146478652954 0.021805775316455698 neutral Wobel also recalled that Oswald once outlined a plan to cut the glass in the LJSpeech LJ043-0061 65 male 1211.890625 0.7867423295974731 0.021805775316455698 neutral At the time of his defection, when he evidenced no interest in his father and LJSpeech LJ033-0097 60 male 961.7119750976562 0.8147653937339783 0.021805775316455698 neutral Frazier parked the car in the company parking lot about two blocks north of the depository LJSpeech LJ019-0234 60 male 982.4412841796875 0.8467894792556763 0.0275625 sad The old buildings were entirely disused, and the whole of the inmates of Newgate were LJSpeech LJ016-0296 54 male 1260.2855224609375 0.4604700207710266 0.01482325 neutral The actual execution made some impression. LJSpeech LJ023-0072 69 male 1206.0687255859375 0.7865973711013794 0.0275625 neutral Congress passed a statute which, in 1803, the courts LJSpeech LJ050-0100 67 male 934.5872192382812 0.8466907143592834 0.0275625 neutral The Commission recognizes that no set of meaningful criteria will yield LJSpeech LJ028-0113 63 male 1025.2725830078125 0.6758575439453125 0.018996001683501684 neutral With mortar and bricks, he built two moat walls about the city. LJSpeech LJ013-0121 65 male 1238.6756591796875 0.7939990162849426 0.023200757575757576 sad Howe and his accomplice were arrested. The former was found guilty and sentenced LJSpeech LJ014-0273 63 male 1042.2625732421875 0.8731350302696228 0.023200757575757576 neutral The detection of these frauds came while he was still prominently before the world as LJSpeech LJ027-0136 63 male 1052.698974609375 0.8104369044303894 0.023200757575757576 neutral Illustrations quoted from the works of Romains and Locante will make this principle LJSpeech LJ003-0067 52 male 1101.8662109375 0.4360215961933136 0.011485659246575342 neutral in the then existing state of the law, LJSpeech LJ039-0220 65 male 1079.593994140625 0.8265351057052612 0.023598030821917807 neutral and that one would not have to be an expert marksman to have accomplished the assassination. LJSpeech LJ018-0029 63 male 892.7648315429688 0.8415856957435608 0.023598030821917807 neutral A photograph of Mueller shown the jeweler was identified as the likeness of the jewel. LJSpeech LJ001-0080 70 male 945.280517578125 0.8912354707717896 0.023598030821917807 neutral He seems to have taken the letter of the Elseviers of the 17th century for his mom. LJSpeech LJ028-0454 59 male 1161.063720703125 0.4748651683330536 0.01538449074074074 neutral But both sections originally reached the river. LJSpeech LJ037-0244 61 male 1057.122802734375 0.8424121737480164 0.025520833333333333 neutral Westbrook identified Commission Exhibit 162 as the light-colored LJSpeech LJ008-0174 64 male 970.2979125976562 0.8039829730987549 0.025520833333333333 sad One cartload of spectators having broken down, some of its occupants fell LJSpeech LJ009-0302 62 male 1106.8897705078125 0.7679134011268616 0.025520833333333333 sad Another man was hired, himself a convict, whose fees for self and wife were LJSpeech LJ002-0182 60 male 1128.78076171875 0.7411836981773376 0.021869359205776172 neutral the fleet and the Marshall Sea prisons especially devoted to them. LJSpeech LJ048-0023 61 male 983.9628295898438 0.8298522233963013 0.024875902527075812 neutral and he had told us during one of the interviews that he would probably take his wife back to LJSpeech LJ019-0257 65 male 1225.7010498046875 0.7593720555305481 0.024875902527075812 sad Here, the tread wheel was in use. There, cellular cranks. LJSpeech LJ040-0046 67 male 991.2825927734375 0.7420750856399536 0.024875902527075812 neutral which one associate described as, quote, irrevocable, end quote. LJSpeech LJ006-0295 60 male 1199.44775390625 0.8367014527320862 0.02745268924302789 neutral The governor was also personally responsible for gross contravention of this rule of LJSpeech LJ028-0252 51 male 1160.4306640625 0.45107951760292053 0.014317978087649402 neutral Only the king's son, Belshazzar, was killed. LJSpeech LJ035-0002 64 male 921.927734375 0.7000311613082886 0.02745268924302789 neutral Chapter 4 The Assassin Part 4 Oswald's Actions LJSpeech LJ009-0189 65 male 1282.5374755859375 0.8760916590690613 0.02745268924302789 neutral Persons were still living in 1855 who had witnessed dissections at Hicks Hall. LJSpeech LJ011-0032 61 male 919.7144165039062 0.8366391062736511 0.02452179715302491 neutral declared that they had hitherto formed a high opinion of his honor, integrity, and LJSpeech LJ042-0155 61 male 1077.17431640625 0.6541056036949158 0.018654137010676156 neutral It appears to be the work of a fairly well-organized person. LJSpeech LJ004-0039 55 male 796.300537109375 0.7321549654006958 0.02127335409252669 sad They were hopeless of any general reform by the action of the executive alone. LJSpeech LJ025-0157 61 male 1216.162353515625 0.8438274264335632 0.02452179715302491 neutral under these circumstances, unnatural as they are, with proper management LJSpeech LJ030-0121 56 male 970.6068725585938 0.7965176701545715 0.02469758064516129 happy Several times, Special Agent John D. Reddy came forward from the right front LJSpeech LJ010-0135 60 male 1289.2467041015625 0.5666248798370361 0.019533378136200718 happy yelled out three cheers to the populace whom he faced. LJSpeech LJ050-0118 67 male 922.9232788085938 0.8592643141746521 0.02469758064516129 neutral Since these agencies are already obliged constantly to evaluate the activities LJSpeech LJ040-0175 60 male 1017.7406005859375 0.7815579175949097 0.02331832437275986 neutral through which they could not reach him, but that he preferred the veil to remain intact. LJSpeech LJ013-0042 62 male 1047.09814453125 0.8341838717460632 0.023279138513513514 neutral the foundations of which had been laid by buying old ships on purpose to cast the more LJSpeech LJ014-0076 61 male 1151.91357421875 0.835938036441803 0.023279138513513514 happy He was seen afterwards smoking and talking with his hosts in their back parlor. LJSpeech LJ008-0097 64 male 1172.0582275390625 0.7829859852790833 0.023279138513513514 sad No sooner was the job finished than half a dozen competitors appeared. LJSpeech LJ043-0166 64 male 824.7908325195312 0.7988844513893127 0.023279138513513514 neutral Possibly he might have wanted to be caught and wanted his involvement made clear LJSpeech LJ045-0127 64 male 1247.29638671875 0.8624638915061951 0.023122902684563757 neutral He absolved the Soviet embassy in Mexico City of any blame for his difficulties. LJSpeech LJ027-0117 65 male 964.4253540039062 0.79942786693573 0.023122902684563757 happy Now, here again the former theory appears to be triumphant over the latter. LJSpeech LJ035-0076 63 male 1200.859130859375 0.8349494338035583 0.023122902684563757 happy Special Agent John Howlett of the Secret Service carried a rifle from the south LJSpeech LJ005-0101 58 male 1129.3095703125 0.7242733836174011 0.0196302432885906 sad Quince it deduced the practice and condition of every prison that replied. LJSpeech LJ028-0302 64 male 1078.13818359375 0.7867964506149292 0.027235671936758892 happy I will desert to the enemy as I am, and when I get into their city, LJSpeech LJ036-0196 65 male 1095.0950927734375 0.803264319896698 0.027235671936758892 neutral Tippett patrolled District 78 in the Oak Cliff area of Dixie. LJSpeech LJ028-0191 60 male 1061.2279052734375 0.42850354313850403 0.013888586956521738 happy The old enemies of Babylon rejoiced. LJSpeech LJ038-0214 57 male 1284.11328125 0.8420044779777527 0.027235671936758892 neutral and the other paragraphs instructed her on the disposal of Oswald's personal effects LJSpeech LJ003-0304 65 male 832.8008422851562 0.8691624999046326 0.023760775862068966 neutral with the penalty of forfeiting the day's allowance of food, an increase of which the committee LJSpeech LJ019-0009 56 male 1075.1240234375 0.7964742183685303 0.02298556034482759 neutral or to insist upon the construction of prisons on the most approved plan. LJSpeech LJ022-0192 57 male 1077.5343017578125 0.5187281370162964 0.014764870689655173 neutral They contemplate the enrichment of our national life. LJSpeech LJ007-0026 58 male 1158.8209228515625 0.793979287147522 0.023760775862068966 sad And with all this, the most dreadful oaths, the worst language, too bad to be repeated. LJSpeech LJ035-0158 66 male 997.3316040039062 0.8228869438171387 0.023679123711340205 neutral A blue jacket, later identified by Marina Oswald as her husband's, was LJSpeech LJ033-0014 54 male 959.2794799804688 0.8163527250289917 0.023679123711340205 sad Lee Harvey Oswald lived in a rooming house in Dallas, while his wife and children lived LJSpeech LJ018-0369 66 male 1060.0634765625 0.7843106389045715 0.023679123711340205 neutral the mysterious Bravo case, that of Dr. Lamson, and that of LJSpeech LJ017-0028 61 male 1008.4209594726562 0.8143091797828674 0.023679123711340205 neutral that she had had a quarrel with her mistress, and that the latter, with all others, LJSpeech LJ004-0109 57 male 1024.0537109375 0.4420858919620514 0.013042491007194245 neutral according to their categories or crimes. LJSpeech LJ011-0239 63 male 950.5305786132812 0.8259799480438232 0.02478642086330935 neutral Where robbery with violence was intended, the perpetrators had now to adopt various LJSpeech LJ049-0034 62 male 1265.04345703125 0.8034409284591675 0.02478642086330935 surprised could have reached the President in time to protect him from the second and fatal shot LJSpeech LJ016-0389 65 male 846.389892578125 0.8337498307228088 0.024438174460431655 neutral Giovanni Lanni, the Italian boy who murdered a French woman in the hay market. LJSpeech LJ037-0018 58 male 978.4235229492188 0.7971556186676025 0.029197563559322032 sad Skaggins saw him and heard him mutter either, Poor damn cop or... LJSpeech LJ041-0163 65 male 961.3748779296875 0.867605984210968 0.029197563559322032 neutral Out of a combination of Oswald's known Marxist sympathies and George Orwell's LJSpeech LJ017-0178 61 male 1041.3836669921875 0.7767281532287598 0.029197563559322032 surprised It appeared that several persons with whom she was intimate had succumbed suddenly. LJSpeech LJ048-0006 54 male 986.5155639648438 0.377620667219162 0.014482256355932204 neutral or to the Vice President." LJSpeech LJ002-0184 64 male 914.2891845703125 0.8724705576896667 0.025333180147058824 neutral the latter two being also a prison for felons and vagrants arrested within certain LJSpeech LJ012-0143 65 male 1187.48974609375 0.8944374322891235 0.025333180147058824 neutral Money Moses had received the stolen gold dust from Moss's father-in-law, Davis. LJSpeech LJ023-0012 59 male 1082.41943359375 0.7502818703651428 0.025333180147058824 neutral But when almost two years later it came before the Supreme LJSpeech LJ036-0068 63 male 865.4218139648438 0.6849870085716248 0.02191842830882353 sad Both buses stopped within one block of the depository building. LJSpeech LJ046-0158 66 male 1267.1650390625 0.8534243106842041 0.02222782258064516 neutral At the time of the assassination, the active PRS general files contained LJSpeech LJ011-0287 60 male 1136.369384765625 0.8295741677284241 0.02222782258064516 neutral but at length managed to wriggle out of the chain which confined his body and LJSpeech LJ016-0399 65 male 1150.46533203125 0.8492998480796814 0.02222782258064516 neutral Wainwright was allowed a cigar the night before execution, which he smoked in the prison yard. LJSpeech LJ006-0012 61 male 944.3914794921875 0.8449038863182068 0.02222782258064516 neutral but he would not arm them with any authority lest their cooperation might be offensive. LJSpeech LJ005-0102 62 male 1208.780029296875 0.8509426712989807 0.02194466560509554 neutral Upon these and the private visitations made by various members, the Society LJSpeech LJ001-0169 66 male 1138.5301513671875 0.825796365737915 0.02194466560509554 neutral The paper used for printing the small, highly ornamented French service books. LJSpeech LJ011-0271 66 male 1177.6123046875 0.8682052493095398 0.02194466560509554 neutral Mr. G went in the coach sent for him and alighted at 27 York Street, LJSpeech LJ014-0057 65 male 1001.4357299804688 0.8337165117263794 0.02194466560509554 neutral London did not escape the contagion and, prominent among the detestable crimes of LJSpeech LJ008-0191 65 male 960.4435424804688 0.7966058254241943 0.025904605263157895 disgusted At Courvoisier's execution in 1840, it was the same, or worse. LJSpeech LJ007-0120 60 male 1031.863525390625 0.5902575850486755 0.018803806390977444 sad some to the infirmary, many more to the governor's house. LJSpeech LJ012-0268 56 male 1237.5946044921875 0.7429749965667725 0.023254934210526317 neutral Suspicion grew almost to certainty as the evidence was unfolded. LJSpeech LJ015-0298 70 male 1055.9366455078125 0.7635851502418518 0.025904605263157895 happy But while Hardwick was in communication with Sawward, the bank was in communication with LJSpeech LJ030-0213 62 male 1213.4908447265625 0.7537742257118225 0.02620009505703422 neutral Hill heard a second shot, approximately five seconds after the first, which removed LJSpeech LJ009-0121 61 male 1158.9774169921875 0.7147454023361206 0.02333769011406844 happy for all thy goodness and loving kindness to us and to all men. LJSpeech LJ037-0084 63 male 1030.7886962890625 0.796912431716919 0.02620009505703422 neutral Barbara Jeanette Davis testified that no one had shown her a picture of Oswald before. LJSpeech LJ009-0219 56 male 1173.5 0.4054138660430908 0.01342134030418251 happy A clause was inserted to the effect that LJSpeech LJ040-0143 64 male 1050.6236572265625 0.8078874945640564 0.023598030821917807 neutral Contrary to reports that appeared after the assassination, the psychiatric examiner LJSpeech LJ006-0048 62 male 1121.3856201171875 0.8324679732322693 0.023598030821917807 neutral To these were still added an average of about 50, expecting the last penalty. LJSpeech LJ033-0006 58 male 1046.85986328125 0.5671167969703674 0.01674593321917808 neutral In this connection, the Commission considered one LJSpeech LJ044-0207 64 male 804.4202270507812 0.8754435181617737 0.023598030821917807 neutral the economic embargo against that country, and the general policy of the United States. LJSpeech LJ032-0260 62 male 1013.1453247070312 0.8036773204803467 0.023598030821917807 neutral He thought it contained tent poles or possibly other camping equipment, such as LJSpeech LJ017-0030 63 male 1237.06982421875 0.842721164226532 0.023598030821917807 happy When the spread of scientific knowledge places nefarious means at the disposal of LJSpeech LJ005-0084 62 male 1217.72900390625 0.7495614290237427 0.023598030821917807 neutral so as to prevent them from seeing, conversing, or holding any interaction. LJSpeech LJ008-0236 52 male 1014.2316284179688 0.7807179093360901 0.02271853595890411 neutral Coiled up on the floor of the scaffold like a serpent, the hangman's rope. LJSpeech LJ027-0020 60 male 1027.0858154296875 0.7961769104003906 0.026502403846153846 happy The unity in life, then, is not less a fact than is life's great diversity. LJSpeech LJ017-0238 66 male 1097.978515625 0.7882980704307556 0.026502403846153846 fearful Tefer the second mate agreed, but constantly went in fear of his life. LJSpeech LJ045-0233 66 male 1085.60400390625 0.800809919834137 0.026502403846153846 angry He consistently refused to admit involvement in the assassination LJSpeech LJ038-0099 48 male 939.3925170898438 0.5590772032737732 0.019237740384615384 neutral From the outset, Oswald denied owning a rifle. LJSpeech LJ028-0231 59 male 916.1470947265625 0.5505949854850769 0.01667814781021898 neutral whereupon they withdrew within their defenses. LJSpeech LJ044-0239 66 male 1262.919677734375 0.8477627635002136 0.025148266423357664 neutral and raises serious questions as to whether or not he ever expected to LJSpeech LJ050-0086 65 male 1170.1837158203125 0.8474505543708801 0.025148266423357664 neutral Under these criteria, whether the case should be referred to the Secret Service depends LJSpeech LJ047-0021 66 male 1138.4693603515625 0.8763906955718994 0.025148266423357664 neutral information regarding his relations with the U.S. Embassy in Moscow and background ================================================ FILE: README.md ================================================ # SpeechCraft This is the official repository of the ACM Multimedia 2024 paper *"SpeechCraft: A Fine-Grained Expressive Speech Dataset with Natural Language Description"*. For details of the pipeline and dataset, please refer to our [Paper](http://arxiv.org/abs/2408.13608) and [Demo Page](https://speechcraft2024.github.io/speechcraft2024/) ## News [2024-09-26]: **Structured metadata** (pitch, energy, speed, age, gender, emotion tone, emphasis, topic/category, and transcript) has been made available to facilitate further enhancements and augmentations of the dataset. [2024-12-20]: Code and checkpoint of the **Annotation pipeline** are released. ## SpeechCraft Dataset ### 1. Download Speech Corpus |Language|Speech Corpus|#Duration|#Clips| |:--------:|:--------:|--------:|--------:| |ZH|[Zhvoice](https://github.com/fighting41love/zhvoice)|799.68h|1,020,427| |ZH|[AISHELL-3](https://www.openslr.org/93/)|63.70h|63,011| |EN|[GigaSpeech-M](https://huggingface.co/datasets/speechcolab/gigaspeech/tree/main/data/audio/m_files_additional)|739.91h|670,070| |EN|[LibriTTS-R](https://www.openslr.org/141/)|548.88h|352,265| ### 2. Download Speech Annotation ||Description|Instruction|Labels| |:--------:|:--------:|:--------:|:--------:| |ZH|[download](https://cloud.tsinghua.edu.cn/f/e66664542f534f399802/?dl=1)|[download](https://cloud.tsinghua.edu.cn/f/d6f00e027f504751b4c0/?dl=1)|[download](https://cloud.tsinghua.edu.cn/f/02a69d7c862e4422850e/?dl=1)| |EN|[download](https://cloud.tsinghua.edu.cn/f/517428835bd5486e87e8/?dl=1)|[download](https://cloud.tsinghua.edu.cn/f/cce83dd884ed4104b1a1/?dl=1)|[download](https://cloud.tsinghua.edu.cn/f/6f05dcbcfb384ea1870b/?dl=1)| ### 3. Labels and Prompts #### EN Version - `--gender`: Male, Female - `--age`: Child, Teenager, Youth adult, Middle-aged, Elderly - `--pitch`: low, normal, high - `--speed`: slow, normal, fast - `--volume`: low, normal, high - `--emotion (English)`: Fearful, Happy, Disgusted, Sad, Surprised, Angry, Neutral - `--emphasis`: Non-label words - `--transcript`: Non-label sentence - `--LLM Prompt`: ```Given the pitch, volume, age, gender, tone, and transcript, use sentiment analysis techniques to describe in natural language what age, what gender of a person, with what kind of emotion and tone, using what kind of pitch and volume, spoke the words in the transcript. Note: You must vividly describe the sentence’s intonation, pitch, tone, and emotion. All outputs must strictly avoid identical wording and sentence structure. There is no need to describe body language or psychological state and do not repeat the input content. Refer to the format of the following four cases: *Example Input - Example Output* Now try to process the following sentences, directly output the converted sentences according to the examples without missing any labels. ``` #### ZH Version - `--年龄`:儿童,少年,青年,中年,老年 - `--性别`:男,女 - `--语速`:快,中,慢 - `--音高`:高,中,低 - `--音量`:高,中,低 - `--重读`:无标签,字词 - `--语气`:无标签,自然语句 - `--文本`:无标签,自然语句 - `--LLM Prompt`: ``` 请参照以下转换案例,使用中文自然语言描述一个人按照给定风格属性,如音高、音量、年龄、性别、语调,来说文本中的话。注意,仅描述说话风格,不需要描述肢体动作或心理状态,不要重复输入的内容。 *示例输入-示例输出* 现在尝试处理以下句子,根据示例直接输出转换后的句子,不要遗漏任何标签。 ``` ### 4. Request Access to Emphasis Speech Dataset Since we do not own the copyright of the original audio files, for researchers and educators who wish to use the audio files for non-commercial research and/or educational purposes, we can provide access to our regenerated version under certain conditions and terms. To apply for the AISHELL-3 and LibriTTS-R with fine-grained keyword emphasis, please fill out the EULA form at `Emphasis-SpeechCraft-EULA.pdf` and send the scanned form to jinzeyu23@mails.tsinghua.edu.cn. Once approved, you will be supplied with a download link. **([2024-09-26]: With metadata updated!)** Please first refer to some emphasis examples provided [here](https://speechcraft2024.github.io/speechcraft2024/#13-examples-of-the-regenerated-emphasis-data-from-aishell-3-and-libritts-r). We are actively working on improving methods for large-scale fine-grained data construction that align with human perception. |Language|Speech Corpus|#Duration|#Clips| |:--------:|:--------:|--------:|--------:| |ZH|AISHELL-3-stress|50.59h|63,258| |EN|LibriTTS-R-stress|148.78h|75,654| ## Annotation Pipeline ### Step 0 : Installation 1. Download models for speech style recognition. Models from 🤗: ``` llama_base_model = "baichuan-inc/Baichuan2-13B-Base" gender_model_path = "alefiury/wav2vec2-large-xlsr-53-gender-recognition-librispeech" age_model_path = "audeering/wav2vec2-large-robust-24-ft-age-gender" asr_path = "openai/whisper-medium" / "openai/whisper-large-v3" ``` Model from funasr (for English emotion classification): ``` emotion_model = "iic/emotion2vec_base_finetuned" ``` Prepare SECap from [here](https://github.com/thuhcsi/SECap) (for Chinese emotion captioning). 2. Create conda environment ``` conda env create -f ./requirements.yaml mv ./AutomaticPipeline/models/SECap/model2.py $your_SECap_dir ``` 3. Download the lora ckpt from [here](https://cloud.tsinghua.edu.cn/d/548948399d7c4816b677/) as `./llama-ft/finetuned-llama/` for description rewriting. Remember to change the path of LLM ckpt at "`base_model_name_or_path`" in `./llama-ft/finetuned-llama/adapter_config.json`. ### Step 1 : Labeling with the Automatic Annotation Pipeline 1. Get the scp file with raw scores for the audio corpus. ``` cd ./AutomaticPipeline python AutoPipeline.py ``` 2. Get the json file with classified result prepared for the description rewriting. ``` python Clustering.py ``` ### Step 2 : Rewriting with the Finetuned Llama ``` cd ../llama-ft python llama_infer.py ``` ## Citation Please cite our paper if you find this work useful: ``` @inproceedings{jin2024speechcraft, title={Speechcraft: A fine-grained expressive speech dataset with natural language description}, author={Jin, Zeyu and Jia, Jia and Wang, Qixin and Li, Kehan and Zhou, Shuoyi and Zhou, Songtao and Qin, Xiaoyu and Wu, Zhiyong}, booktitle={Proceedings of the 32nd ACM International Conference on Multimedia}, pages={1255--1264}, year={2024} } ``` ================================================ FILE: llama-ft/llama_infer.py ================================================ import os import torch import argparse from accelerate import Accelerator from transformers import AutoModelForCausalLM, AutoTokenizer from peft import AutoPeftModelForCausalLM import json from tqdm import tqdm import random import torch.nn as nn import torch.distributed as dist from torch.utils.data import DataLoader, Subset torch.multiprocessing.set_start_method('spawn', force=True) PROMPT_DICT = { "prompt_input": ( "Below is an instruction that describes a task, paired with an input that provides further context. " "Write a response that appropriately completes the request.\n\n" "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:" ), "prompt_no_input": ( "Below is an instruction that describes a task. " "Write a response that appropriately completes the request.\n\n" "### Instruction:\n{instruction}\n\n### Response:" ), "chinese": ( "Below is an instruction that describes a task. " "Write a response that appropriately completes the request.\n\n" "### Instruction:\n给定音高、音量、年龄、性别、语气等信息以及文本,运用情感分析的技巧,用中文自然语言描述。" "注意必须生动且多样化地描述,不需要描述肢体动作或心理状态,不要重复input内容。\n\n" "###Input:\n{labels}\n\n### Response:" ), "english": ( "Below is an instruction that describes a task. " "Write a response that appropriately completes the request.\n\n" "### Instruction:\nGiven information such as pitch, volume, age, gender, tone, category and text, describe using English natural language with the techniques of emotional analysis." "Make sure the description is vivid and diverse, without the need to describe physical actions or psychological states, and avoid repeating the input content.\n\n" "###Input:\n{labels}\n\n### Response:" ) } class Dataset(torch.utils.data.Dataset): def __init__(self, args): self.language = args.language self.ckpt_path = args.ckpt_path self.testdata = json.load(open(args.json_path)) self.keyindex = list(self.testdata.keys()) self.tokenizer = AutoTokenizer.from_pretrained(args.ckpt_path, use_fast=False, trust_remote_code=True) def __len__(self): return len(self.testdata) def __getitem__(self, index): user_tokens=[195] assistant_tokens=[196] key = self.keyindex[index] value = self.testdata[key] tags = value['labels'].strip().split('\t') # shuffle tags random.shuffle(tags) value['labels'] = '\t'.join(tags) prompt = '' + PROMPT_DICT[self.language].format_map(value) + '' inputs = self.tokenizer(prompt, return_tensors='pt') return inputs, key, prompt, value['labels'] def extract(input_text, language): if language=='chinese': if "“" not in input_text: return None else: start_index = input_text.find("“") end_index = input_text.find("”", start_index+1) return input_text[start_index:end_index] else: if input_text.count("\"") < 2: return None else: start_index = input_text.find("\"") end_index = input_text.rfind("\"") return input_text[start_index+1:end_index] def inference_on_device(args, tokenizer, device, dataloader): language = args.language output_path = args.output_path error_path = args.error_path ckpt_path = args.ckpt_path model = AutoPeftModelForCausalLM.from_pretrained( ckpt_path, revision="v2.0", trust_remote_code=True, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2", ) # model = model.quantize(4) model.to(device) top_p = 0.3 temperature = 0.7 count = 0 test_result = {} error = {} output_path = output_path[:-4]+str(device)+'.json' with torch.no_grad(): for inputs, keys, prompts, labels in tqdm(dataloader): # unwrap the input inputs = {key: value[0].to(device, dtype=torch.long) for key, value in inputs.items()} # inputs = {key: value[0].to(device, dtype=torch.bfloat16) for key, value in inputs.items()} key = keys[0] prompt = prompts[0] info = {} labels = labels[0] try: pred = model.sample(**inputs, max_new_tokens=128, repetition_penalty=1.1, do_sample=True, use_cache=True, temperature=temperature, top_p=top_p) generation = tokenizer.decode(pred.cpu()[0], skip_special_tokens=True)[len(prompt):] ct = 0 while (extract(generation, language) is None or (len(generation)-len(extract(generation, language))<3)) and ct<5: pred = model.sample(**inputs, max_new_tokens=128, repetition_penalty=1.1, do_sample=True, use_cache=True, temperature=temperature, top_p=top_p) generation = tokenizer.decode(pred.cpu()[0], skip_special_tokens=True)[len(prompt):] ct+=1 if ct==5: with open(error_path, 'a') as file: file.write(key+'\n') continue info['labels'] = labels info['llama-instruction'] = generation test_result[key] = info count+=1 except Exception as e: with open(error_path, 'a') as file: file.write(key+'\n') # raise e if count % 1000 ==0 or count==50: json.dump(test_result, open(output_path, 'w'), indent=4, ensure_ascii=False) json.dump(test_result, open(output_path, 'w'), indent=4, ensure_ascii=False) def main(args): devices = list(map(int, args.devices.split(','))) tokenizer = AutoTokenizer.from_pretrained(args.ckpt_path, revision="v2.0", use_fast=False, trust_remote_code=True) inferset = Dataset(args) num_devices = len(devices) subset_size = len(inferset) // num_devices subsets = [ Subset(inferset, range(i * subset_size, (i + 1) * subset_size)) for i in range(num_devices) ] data_loaders = [ DataLoader(subset, batch_size=1) for subset in subsets ] processes = [] for i in range(num_devices): device_num = devices[i] device = torch.device(f'cuda:{device_num}') # device = torch.device('cpu') p = torch.multiprocessing.Process(target=inference_on_device, args=(args, tokenizer, device, data_loaders[i])) p.start() processes.append(p) for p in processes: p.join() if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('--language', type=str, default = 'english') parser.add_argument('--devices', type=str, default = '0') parser.add_argument('--json_path', type=str, default = './example_libritts.json') parser.add_argument('--output_path', type=str, default = './inference_libritts.json') parser.add_argument('--error_path', type=str, default = './error.txt') parser.add_argument('--ckpt_path', type=str, default = './finetuned-llama') args = parser.parse_args() main(args) ================================================ FILE: requirements.yaml ================================================ name: speechcraft channels: - http://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/pytorch/ - http://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main/ - http://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/ - simpleitk - pytorch - menpo - msys2 - nvidia - defaults - bioconda - conda-forge dependencies: - _libgcc_mutex=0.1=conda_forge - _openmp_mutex=4.5=2_gnu - bzip2=1.0.8=h5eee18b_6 - ca-certificates=2024.11.26=h06a4308_0 - ld_impl_linux-64=2.40=h12ee557_0 - libffi=3.4.4=h6a678d5_1 - libgcc=14.2.0=h77fa898_1 - libgcc-ng=14.2.0=h69a702a_1 - libgomp=14.2.0=h77fa898_1 - libnsl=2.0.1=hd590300_0 - libsqlite=3.47.0=hadc24fc_1 - libstdcxx-ng=11.2.0=h1234567_1 - libuuid=2.38.1=h0b41bf4_0 - libxcrypt=4.4.36=hd590300_1 - libzlib=1.3.1=hb9d3cd8_2 - ncurses=6.4=h6a678d5_0 - openssl=3.4.0=hb9d3cd8_0 - python=3.9.18=h0755675_1_cpython - readline=8.2=h5eee18b_0 - setuptools=75.1.0=py39h06a4308_0 - tk=8.6.13=noxft_h4845f30_101 - wheel=0.44.0=py39h06a4308_0 - xz=5.4.6=h5eee18b_1 - pip: - absl-py==2.1.0 - accelerate==0.28.0 - aiohappyeyeballs==2.4.4 - aiohttp==3.11.10 - aiosignal==1.3.1 - aliyun-python-sdk-core==2.16.0 - aliyun-python-sdk-kms==2.16.5 - annotated-types==0.7.0 - antlr4-python3-runtime==4.9.3 - anyio==4.7.0 - argparse==1.4.0 - asgiref==3.8.1 - async-timeout==4.0.3 - asyncio==3.4.3 - attrs==24.2.0 - audioread==3.0.1 - backoff==2.2.1 - bcrypt==4.2.1 - beartype==0.17.2 - beautifulsoup4==4.12.3 - bitarray==3.0.0 - bitsandbytes==0.43.0 - black==24.10.0 - build==1.2.2.post1 - cachetools==5.5.0 - certifi==2024.8.30 - cffi==1.17.1 - charset-normalizer==3.4.0 - chroma-hnswlib==0.7.6 - chromadb==0.5.23 - click==8.1.7 - cohere==5.1.8 - colorama==0.4.6 - coloredlogs==15.0.1 - colt5-attention==0.10.20 - contourpy==1.3.0 - crcmod==1.7 - cryptography==44.0.0 - cycler==0.12.1 - cython==3.0.11 - dalle3==0.1.0 - dataclasses-json==0.6.7 - datasets==2.19.2 - decorator==5.1.1 - deprecated==1.2.15 - diffusers==0.31.0 - dill==0.3.8 - distance==0.1.3 - docstring-parser==0.16 - duckduckgo-search==6.4.1 - durationpy==0.9 - editdistance==0.8.1 - einops==0.7.0 - einops-exts==0.0.4 - einx==0.3.0 - encodec==0.1.1 - exceptiongroup==1.2.2 - fairscale==0.4.13 - faiss-cpu==1.9.0.post1 - fastapi==0.115.6 - fastavro==1.9.7 - filelock==3.13.1 - flake8==7.1.1 - flash-attn==2.3.6 - flatbuffers==24.3.25 - fonttools==4.55.2 - frozendict==2.4.6 - frozenlist==1.5.0 - fsspec==2024.2.0 - ftfy==6.3.1 - funasr==1.1.16 - g2p-en==2.1.0 - ggl==1.1.0 - google-ai-generativelanguage==0.6.10 - google-api-core==2.24.0 - google-api-python-client==2.155.0 - google-auth==2.37.0 - google-auth-httplib2==0.2.0 - google-generativeai==0.8.3 - googleapis-common-protos==1.66.0 - greenlet==3.1.1 - grpcio==1.68.1 - grpcio-status==1.68.1 - h11==0.14.0 - httpcore==1.0.7 - httplib2==0.22.0 - httptools==0.6.4 - httpx==0.28.1 - huggingface-hub==0.26.3 - humanfriendly==10.0 - hydra-core==1.3.2 - idna==3.10 - importlib-metadata==8.5.0 - importlib-resources==6.4.5 - inflect==7.4.0 - iniconfig==2.0.0 - jaconv==0.4.0 - jamo==0.4.1 - jieba==0.42.1 - jinja2==3.1.3 - jmespath==0.10.0 - joblib==1.4.2 - jsonpatch==1.33 - jsonpointer==3.0.0 - kaldiio==2.18.0 - kiwisolver==1.4.7 - kubernetes==31.0.0 - langchain==0.1.13 - langchain-community==0.0.29 - langchain-core==0.1.53 - langchain-experimental==0.0.55 - langchain-text-splitters==0.0.2 - langsmith==0.1.147 - lazy-loader==0.4 - libcst==1.5.1 - librosa==0.10.2.post1 - lightning==2.4.0 - lightning-utilities==0.11.9 - lion-pytorch==0.2.3 - llvmlite==0.43.0 - local-attention==1.9.3 - loguru==0.7.2 - lxml==5.3.0 - markdown==3.7 - markdown-it-py==3.0.0 - markupsafe==2.1.5 - marshmallow==3.23.1 - matplotlib==3.9.3 - mccabe==0.7.0 - mdurl==0.1.2 - mmh3==5.0.1 - modelscope==1.21.0 - monotonic==1.6 - more-itertools==10.5.0 - mpmath==1.3.0 - msgpack==1.1.0 - multidict==6.1.0 - multiprocess==0.70.16 - mypy-extensions==1.0.0 - nest-asyncio==1.6.0 - networkx==3.2.1 - ninja==1.11.1.2 - nltk==3.9.1 - numba==0.60.0 - numpy==1.25.2 - nvidia-cublas-cu12==12.1.3.1 - nvidia-cuda-cupti-cu12==12.1.105 - nvidia-cuda-nvrtc-cu12==12.1.105 - nvidia-cuda-runtime-cu12==12.1.105 - nvidia-cudnn-cu12==8.9.2.26 - nvidia-cufft-cu12==11.0.2.54 - nvidia-curand-cu12==10.3.2.106 - nvidia-cusolver-cu12==11.4.5.107 - nvidia-cusparse-cu12==12.1.0.106 - nvidia-nccl-cu12==2.19.3 - nvidia-nvjitlink-cu12==12.4.127 - nvidia-nvtx-cu12==12.1.105 - oauthlib==3.2.2 - omegaconf==2.3.0 - onnxruntime==1.19.2 - open-clip-torch==2.29.0 - openai==0.28.0 - opencv-python==4.10.0.84 - opencv-python-headless==4.10.0.84 - opentelemetry-api==1.29.0 - opentelemetry-exporter-otlp-proto-common==1.29.0 - opentelemetry-exporter-otlp-proto-grpc==1.29.0 - opentelemetry-instrumentation==0.50b0 - opentelemetry-instrumentation-asgi==0.50b0 - opentelemetry-instrumentation-fastapi==0.50b0 - opentelemetry-proto==1.29.0 - opentelemetry-sdk==1.29.0 - opentelemetry-semantic-conventions==0.50b0 - opentelemetry-util-http==0.50b0 - orjson==3.10.12 - oss2==2.19.1 - outcome==1.3.0.post0 - overrides==7.7.0 - packaging==23.2 - pandas==2.2.3 - pathspec==0.12.1 - peft==0.13.2 - pillow==10.3.0 - pip==24.3.1 - platformdirs==4.3.6 - playwright==1.49.1 - pluggy==1.5.0 - pooch==1.8.2 - portalocker==3.0.0 - posthog==3.7.4 - primp==0.8.3 - propcache==0.2.1 - proto-plus==1.25.0 - protobuf==5.29.1 - psutil==6.1.0 - pyarrow==18.1.0 - pyarrow-hotfix==0.6 - pyasn1==0.6.1 - pyasn1-modules==0.4.1 - pycodestyle==2.12.1 - pycparser==2.22 - pycryptodome==3.21.0 - pydantic==2.7.1 - pydantic-core==2.18.2 - pydub==0.25.1 - pyee==12.0.0 - pyflakes==3.2.0 - pygments==2.18.0 - pynndescent==0.5.13 - pyparsing==3.2.0 - pypdf==4.1.0 - pypdf2==3.0.1 - pypika==0.48.9 - pyproject-hooks==1.2.0 - pysocks==1.7.1 - pytest==8.1.1 - python-dateutil==2.9.0.post0 - python-dotenv==1.0.1 - pytorch-lightning==2.4.0 - pytorch-wpe==0.0.1 - pytz==2024.2 - pyyaml==6.0.2 - qformer==0.0.5 - ratelimit==2.2.1 - regex==2024.11.6 - requests==2.32.3 - requests-oauthlib==2.0.0 - requests-toolbelt==1.0.0 - rich==13.7.1 - rsa==4.9 - sacrebleu==2.4.3 - safetensors==0.4.5 - scikit-learn==1.5.2 - scipy==1.9.3 - selenium==4.27.1 - sentencepiece==0.2.0 - sentry-sdk==2.19.2 - shellingham==1.5.4 - six==1.17.0 - sniffio==1.3.1 - sortedcontainers==2.4.0 - soundfile==0.12.1 - soupsieve==2.6 - soxr==0.5.0.post1 - sqlalchemy==2.0.36 - starlette==0.41.3 - swarms==2.4.0 - sympy==1.13.1 - tabulate==0.9.0 - tenacity==8.2.3 - tensorboard==2.18.0 - tensorboard-data-server==0.7.2 - tensorboardx==2.6.2.2 - termcolor==2.5.0 - threadpoolctl==3.5.0 - tiktoken==0.8.0 - timm==0.4.12 - tokenizers==0.13.3 - toml==0.10.2 - tomli==2.2.1 - torch==2.2.0 - torch-complex==0.4.4 - torchaudio==2.2.0 - torchfix==0.7.0 - torchmetrics==1.6.0 - torchvision==0.17.0 - tqdm==4.66.2 - transformers==4.33.3 - trio==0.27.0 - trio-websocket==0.11.1 - triton==2.2.0 - typeguard==4.4.1 - typer==0.15.1 - types-requests==2.32.0.20241016 - typing==3.7.4.3 - typing-extensions==4.12.2 - typing-inspect==0.9.0 - tzdata==2024.2 - umap-learn==0.5.7 - undetected-chromedriver==3.5.5 - uritemplate==4.1.1 - urllib3==2.2.3 - uvicorn==0.32.1 - uvloop==0.21.0 - vector-quantize-pytorch==1.14.5 - vocos==0.1.0 - watchfiles==1.0.3 - wcwidth==0.2.13 - websocket-client==1.8.0 - websockets==14.1 - werkzeug==3.1.3 - wget==3.2 - wrapt==1.17.0 - wsproto==1.2.0 - xxhash==3.5.0 - yarl==1.18.3 - zetascale==0.4.9 - zipp==3.21.0