// ferritin_plms/esm/tokenization/mod.rs
pub mod sequence_tokenizer;
use crate::esm::utils::constants::models::{normalize_model_name, ESM3_OPEN_SMALL};
use anyhow::{anyhow, Result};
use sequence_tokenizer::{EsmSequenceTokenizer, EsmTokenizerBase};
/// Groups the tokenizers handed out together by [`get_model_tokenizers`].
///
/// Only the sequence tokenizer is active. The other tracks (structure,
/// secondary structure, SASA, function, residue annotations) are kept below
/// as commented-out fields -- presumably placeholders until those tokenizers
/// exist in this crate (TODO confirm).
pub struct TokenizerCollection {
    /// Tokenizer for the sequence track.
    pub sequence: EsmSequenceTokenizer,
    // pub structure: structure_tokenizer::StructureTokenizer,
    // pub secondary_structure: ss_tokenizer::SecondaryStructureTokenizer,
    // pub sasa: sasa_tokenizer::SASADiscretizingTokenizer,
    // pub function: function_tokenizer::InterProQuantizedTokenizer,
    // pub residue_annotations: residue_tokenizer::ResidueAnnotationsTokenizer,
}
/// Builds the [`TokenizerCollection`] for the requested model.
///
/// `model` is passed through `normalize_model_name` and the result is
/// compared against `ESM3_OPEN_SMALL`, the only model accepted here.
///
/// # Errors
///
/// Returns an `Unknown model: <model>` error (echoing the name exactly as the
/// caller supplied it) when the normalized name is not `ESM3_OPEN_SMALL`.
pub fn get_model_tokenizers(model: &str) -> Result<TokenizerCollection> {
    // Guard clause: reject anything that is not the single supported model.
    if normalize_model_name(model) != ESM3_OPEN_SMALL {
        return Err(anyhow!("Unknown model: {}", model));
    }

    let sequence = EsmSequenceTokenizer::default();
    Ok(TokenizerCollection {
        sequence,
        // structure: structure_tokenizer::StructureTokenizer::new()?,
        // secondary_structure: ss_tokenizer::SecondaryStructureTokenizer::new("ss8")?,
        // sasa: sasa_tokenizer::SASADiscretizingTokenizer::new()?,
        // function: function_tokenizer::InterProQuantizedTokenizer::new()?,
        // residue_annotations: residue_tokenizer::ResidueAnnotationsTokenizer::new()?,
    })
}
// pub fn get_invalid_tokenizer_ids(tokenizer: &impl EsmTokenizerBase) -> Vec<i64> {
// if tokenizer.is_sequence_tokenizer() {
// vec![
// tokenizer.mask_token_id(),
// tokenizer.pad_token_id(),
// tokenizer.cls_token_id(),
// tokenizer.eos_token_id(),
// ]
// } else {
// vec![
// tokenizer.mask_token_id(),
// tokenizer.pad_token_id(),
// tokenizer.bos_token_id(),
// tokenizer.eos_token_id(),
// ]
// }
// }