ferritin_plms/esm/tokenization/
mod.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
pub mod sequence_tokenizer;
use crate::esm::utils::constants::models::{normalize_model_name, ESM3_OPEN_SMALL};
use anyhow::{anyhow, Result};
use sequence_tokenizer::{EsmSequenceTokenizer, EsmTokenizerBase};

/// Bundle of the tokenizers that are handed out together for one model.
///
/// Only the sequence tokenizer is an actual field right now. The other
/// tokenizers are listed below as commented-out fields and are therefore not
/// part of the type.
pub struct TokenizerCollection {
    /// Sequence tokenizer.
    pub sequence: EsmSequenceTokenizer,
    // Disabled fields (not compiled):
    // pub structure: structure_tokenizer::StructureTokenizer,
    // pub secondary_structure: ss_tokenizer::SecondaryStructureTokenizer,
    // pub sasa: sasa_tokenizer::SASADiscretizingTokenizer,
    // pub function: function_tokenizer::InterProQuantizedTokenizer,
    // pub residue_annotations: residue_tokenizer::ResidueAnnotationsTokenizer,
}

/// Builds the [`TokenizerCollection`] for the model named `model`.
///
/// The name is passed through `normalize_model_name` and the result is
/// compared against `ESM3_OPEN_SMALL`, the only model accepted here. For that
/// model the sequence tokenizer is created with
/// `EsmSequenceTokenizer::default()`.
///
/// # Errors
///
/// Returns an `Unknown model: <model>` error for every other model. The
/// message carries the name exactly as the caller passed it, not the
/// normalized form.
pub fn get_model_tokenizers(model: &str) -> Result<TokenizerCollection> {
    // Guard clause: anything but the single supported model is rejected early.
    if normalize_model_name(model) != ESM3_OPEN_SMALL {
        return Err(anyhow!("Unknown model: {}", model));
    }

    let sequence = EsmSequenceTokenizer::default();
    Ok(TokenizerCollection {
        sequence,
        // Disabled initializers, matching the commented-out struct fields:
        // structure: structure_tokenizer::StructureTokenizer::new()?,
        // secondary_structure: ss_tokenizer::SecondaryStructureTokenizer::new("ss8")?,
        // sasa: sasa_tokenizer::SASADiscretizingTokenizer::new()?,
        // function: function_tokenizer::InterProQuantizedTokenizer::new()?,
        // residue_annotations: residue_tokenizer::ResidueAnnotationsTokenizer::new()?,
    })
}

// pub fn get_invalid_tokenizer_ids(tokenizer: &impl EsmTokenizerBase) -> Vec<i64> {
//     if tokenizer.is_sequence_tokenizer() {
//         vec![
//             tokenizer.mask_token_id(),
//             tokenizer.pad_token_id(),
//             tokenizer.cls_token_id(),
//             tokenizer.eos_token_id(),
//         ]
//     } else {
//         vec![
//             tokenizer.mask_token_id(),
//             tokenizer.pad_token_id(),
//             tokenizer.bos_token_id(),
//             tokenizer.eos_token_id(),
//         ]
//     }
// }