Evo2Preprocessor

Data preprocessing class for Evo2.

Source code in bionemo/evo2/data/preprocess.py

```python
class Evo2Preprocessor:
    """Data preprocessing class for Evo2."""

    BIN = ".bin"
    IDX = ".idx"
    TRAIN = "train"
    VAL = "val"
    TEST = "test"

    def __init__(self, params: Evo2PreprocessingConfig | None = None):
        """Initialize Evo2Preprocessor.

        Args:
            params (Evo2PreprocessingConfig | None): Configuration parameters for preprocessing.
        """
        self.tokenizer: Evo2Tokenizer = Evo2Tokenizer(params)

    @staticmethod
    @contextmanager
    def preprocessing_context_manager(seed: Optional[int] = None):
        """Context manager for setting and restoring the random number generator state.

        Args:
            seed (int | None): Seed for the random number generator. Defaults to None.
        """
        # Track current state.
        current_state = random.getstate()
        try:
            # Set random seed.
            random.seed(seed)
            yield seed
        finally:
            # Restore random state.
            random.setstate(current_state)

    @staticmethod
    def _get_output_filename(
        config: Evo2PreprocessingConfig, ext: Optional[str] = None, split: Optional[str] = None, temp: bool = False
    ) -> Path:
        """Generate the output filename for the preprocessed data.

        Args:
            config (Evo2PreprocessingConfig): Configuration object containing preprocessing settings.
            ext (Optional[str]): File extension for the output file. Defaults to None.
            split (Optional[str]): Data split type (e.g., 'train', 'val', 'test'). Defaults to None.
            temp (bool): Flag indicating whether the file is temporary. Defaults to False.

        Returns:
            Path: The constructed output file path.
        """
        # Get output directory. Defaults to CWD.
        output_dir = config.output_dir
        if output_dir is None:
            output_dir = Path.cwd()
        # Pick up output file prefix.
        config_prefix = "{}_{}".format(config.output_prefix, config.tokenizer_type.lower().replace(" ", ""))
        output_filepath = Path(output_dir) / (
            config_prefix
            + (f"_{split}" if split is not None else "")
            + (ext if ext is not None else "")
            + (".tmp" if temp else "")
        )
        return output_filepath

    @staticmethod
    def _subsequence_generator(sequence: str, subsequence_length: Optional[int] = None, offset: Optional[int] = None):
        """Generate subsequences from a given sequence.

        Args:
            sequence (str): The input sequence.
            subsequence_length (int | None): Length of each subsequence. Defaults to the length of the sequence.
            offset (int | None): Step size for generating subsequences. Defaults to subsequence_length.

        Yields:
            str: Subsequences of the input sequence.
        """
        subsequence_length = subsequence_length if subsequence_length is not None else len(sequence)
        step_size = offset if offset is not None else subsequence_length
        for i in range(0, len(sequence), step_size):
            yield sequence[i : i + subsequence_length]

    @staticmethod
    def _random_reverse_complement(seq: str, prob: float = 0.0, seed: Optional[int] = None):
        """Randomly reverse complement a DNA sequence based on a given probability.

        Args:
            seq (str): The DNA sequence to potentially reverse complement.
            prob (float): The probability of reverse complementing the sequence. Defaults to 0.0.
            seed (Optional[int]): The seed for the random number generator. Defaults to None.

        Returns:
            str: The original or reverse complemented DNA sequence based on the probability.
        """
        with Evo2Preprocessor.preprocessing_context_manager(seed):
            if random.random() < prob:
                return complement_sequence(reverse_sequence(seq))
            else:
                return seq

    @staticmethod
    def _reverse_complement_expansion(seq: str):
        """Generate a list containing the original and reverse complemented sequence.

        Args:
            seq (str): The input DNA sequence.

        Returns:
            list[str]: List containing the original and reverse complemented sequence.
        """
        return [seq, complement_sequence(reverse_sequence(seq))]

    @staticmethod
    def _train_val_test_split(train_weight: float, val_weight: float, test_weight: float, seed: Optional[int] = None):
        """Randomly assign a data point to train, validation, or test split based on provided weights.

        Args:
            train_weight (float): The weight for the training split.
            val_weight (float): The weight for the validation split.
            test_weight (float): The weight for the test split.
            seed (Optional[int]): The seed for the random number generator. Defaults to None.

        Returns:
            str: The split assignment ('train', 'val', or 'test').

        Raises:
            ValueError: If the sum of the weights is zero or negative.
        """
        with Evo2Preprocessor.preprocessing_context_manager(seed if seed is not None else None):
            # Generate random number.
            roll = random.random()
            # Rectify and normalize split ratios.
            total_weight = abs(train_weight) + abs(val_weight) + abs(test_weight)
            if total_weight <= 0:
                raise ValueError("Train-validation-test split proportions cannot be zero.")
            train_split = abs(train_weight) / total_weight
            test_split = abs(test_weight) / total_weight
            split = "train"
            if roll > train_split:
                if roll < 1 - test_split:
                    split = "val"
                else:
                    split = "test"
            return split

    @staticmethod
    def _construct_taxonomy_token(
        lineage: Evo2TaxonomyLineage, dropout: float = 0.0, seed: Optional[int] = None
    ) -> Optional[str]:
        """Construct a special taxonomy token for natural language prompting of DNA generation models.

        Args:
            lineage (Evo2TaxonomyLineage): The taxonomy lineage information.
            dropout (float): The probability of dropping out segments of the lineage. Defaults to 0.0.
            seed (Optional[int]): The seed for the random number generator. Defaults to None.

        Returns:
            Optional[str]: The constructed taxonomy token, or None if lineage is None.
        """
        # If dropout > 0, randomly drop out segments of the lineage for training on incomplete lineages.
        with Evo2Preprocessor.preprocessing_context_manager(seed if seed is not None else None):
            return (
                "|d__{};p__{};c__{};o__{};f__{};g__{};s__{}|".format(
                    lineage.domain if random.random() >= dropout else None,
                    lineage.phylum if random.random() >= dropout else None,
                    lineage.clazz if random.random() >= dropout else None,
                    lineage.order if random.random() >= dropout else None,
                    lineage.family if random.random() >= dropout else None,
                    lineage.genus if random.random() >= dropout else None,
                    lineage.species if random.random() >= dropout else None,
                )
                if lineage is not None
                else None
            )

    def preprocess_data(self, filepath: str, seqid: str, seq: str, seq_idx: int, config: Evo2PreprocessingConfig):
        """Preprocess FASTA datapaths.

        Args:
            filepath (str): Path to the .fasta file.
            seqid (str): Sequence ID.
            seq (str): DNA sequence.
            seq_idx (int): Sequence index.
            config (Evo2PreprocessingConfig): Configuration object containing preprocessing settings.

        Returns:
            tuple[list[dict], float]: Preprocessed data and the time taken for preprocessing.
        """
        # Timing.
        start = time.time()
        # Retrieve taxonomy lineage string if SeqID has associated taxonomy data.
        # Note: Better implemented as a suffix tree substring dictionary, but convenient
        # for identifying a large amount of sequences with identical lineages.
        # Slow for extremely large dictionaries of (SeqID Substr, Taxonomy) pairs.
        lineage = None
        for id, tax in config.taxonomy_data.items():
            # Taxonomy ID is a substring of Seq ID.
            if id in seqid:
                lineage = tax
                break
        # Preprocess data.
        preproc_data = []
        with self.preprocessing_context_manager(
            config.seed + hash(filepath) + seq_idx if config.seed is not None else None
        ):
            # Randomly reverse complement the sequence.
            seq = self._random_reverse_complement(seq, prob=config.random_reverse_complement)
            seqs_to_parse = self._reverse_complement_expansion(seq) if config.embed_reverse_complement else [seq]
            for seq in seqs_to_parse:
                # Sequence modifiers.
                if config.force_uppercase:
                    seq = seq.upper()
                if config.transcribe == "transcribe":
                    seq = transcribe_sequence(seq)
                elif config.transcribe == "back_transcribe":
                    seq = back_transcribe_sequence(seq)
                if config.drop_empty_sequences and len(seq) == 0:
                    continue
                if config.nnn_filter and "NNN" in seq.upper():
                    continue
                # Construct taxonomy token with random dropout on the lineage categories per sequence.
                taxonomy_token = self._construct_taxonomy_token(lineage, dropout=config.random_lineage_dropout)
                # Inject taxonomy lineage tokens every prompt_spacer_length tokens in the sequence.
                # If the taxonomy lineage token is not provided, then just take the original sequence.
                target_length = (
                    config.prompt_spacer_length - len(taxonomy_token) if taxonomy_token is not None else None
                )
                taxonomy_injected_sequence = [
                    taxonomy_token + str(subseq) if taxonomy_token is not None else str(subseq)
                    for subseq in self._subsequence_generator(seq, target_length, target_length)
                ]
                # Wrap and tokenize.
                preproc_data_record = {
                    "text": "".join(taxonomy_injected_sequence),
                }
                preproc_data_record["tokens"] = self.tokenizer.tokenize(
                    preproc_data_record["text"],
                    use_ftfy=config.ftfy,
                    enforce_sample_length=config.enforce_sample_length,
                    append_eod=config.append_eod,
                    drop_empty_sequences=config.drop_empty_sequences,
                )
                preproc_data.append(preproc_data_record)
        end = time.time()
        return preproc_data, end - start

    def preprocess_data_task(self, file_sequence_config):
        """Wrapper function to unpack args for preprocess_data.

        Args:
            file_sequence_config (tuple): Tuple containing arguments for preprocess_data.

        Returns:
            tuple[list[dict], float]: Preprocessed data and the time taken for preprocessing.
        """
        return self.preprocess_data(*file_sequence_config)

    @staticmethod
    def _yield_sequences_from_files(config: Evo2PreprocessingConfig, semaphore: Semaphore):
        """Iterate over sequences within multiple input documents, yielding arguments for multiprocessing tasks.

        Utilized to limit the amount of sequences streamed into memory.

        Args:
            config (Evo2PreprocessingConfig): Configuration object containing preprocessing settings.
            semaphore (Semaphore): Semaphore to limit the number of sequences in memory.

        Yields:
            tuple: Arguments for preprocess_data.
        """

        def yielder(fname, semaphore):
            # Read FASTA.
            index = NvFaidx(fname)
            for i, (seqid, sequence) in enumerate(index.items()):
                semaphore.acquire()
                # Yield filename and sequence within fasta.
                yield str(fname), seqid, sequence, i, config

        for fname in config.datapaths:
            semaphore.acquire()
            yield from yielder(fname, semaphore)

    def preprocess_generator(self, preproc_config: Evo2PreprocessingConfig):
        """Main function to preprocess data for Evo2.

        Args:
            preproc_config (Evo2PreprocessingConfig): Configuration object containing preprocessing settings.

        Yields:
            tuple[dict, float]: Preprocessed sequence data and the time taken for preprocessing.
        """
        # Track which splits have been assigned.
        split_assignments = {
            "train": preproc_config.train_split > 0,
            "val": preproc_config.valid_split > 0,
            "test": preproc_config.test_split > 0,
        }
        splits_needed = {k for k, v in split_assignments.items() if v}

        # Instantiate multiprocessing pool. Use semaphore to limit the amount of sequences to read into memory.
        semaphore = Semaphore(preproc_config.preproc_concurrency + preproc_config.workers)
        if preproc_config.workers > 1:
            pool = mp.Pool(preproc_config.workers)
            # Ordered imap for downstream seeded splitting.
            preproc_tasks = pool.imap(
                self.preprocess_data_task,
                self._yield_sequences_from_files(preproc_config, semaphore),
                chunksize=preproc_config.chunksize,
            )
        else:
            preproc_tasks = (
                self.preprocess_data_task(x) for x in self._yield_sequences_from_files(preproc_config, semaphore)
            )

        # Preprocess data and split results into train, validation, and test.
        with self.preprocessing_context_manager(preproc_config.seed if preproc_config.seed is not None else None):
            for result, elapsed_time in preproc_tasks:
                # Release semaphore for the task associated with the result.
                semaphore.release()
                # If we still need to ensure splits are assigned:
                if splits_needed:
                    # Force assign to a needed split.
                    split = splits_needed.pop()
                else:
                    # Regular random assignment.
                    split = self._train_val_test_split(
                        preproc_config.train_split, preproc_config.valid_split, preproc_config.test_split
                    )
                for sequence in result:
                    sequence["split"] = split
                    yield sequence, elapsed_time

    def preprocess_offline(self, preproc_config: Evo2PreprocessingConfig):
        """Offline data preprocessing script for Evo2.

        Args:
            preproc_config (Evo2PreprocessingConfig): Configuration object containing preprocessing settings.
        """
        # Validate if binaries have already been produced for the given config and overwrite is set to False.
        if any(
            self._get_output_filename(preproc_config, ext, split).is_file()
            for ext in [self.BIN, self.IDX]
            for split in [self.TRAIN, self.VAL, self.TEST]
        ):
            if not preproc_config.overwrite:
                # Skip this dataset!
                logging.info(
                    f"Skipped overwriting (overwrite: False) existing preprocessed data: {preproc_config.output_prefix}"
                )
                return
            else:
                logging.info(
                    f"Overwriting (overwrite: True) existing preprocessed data: {preproc_config.output_prefix}"
                )
        # Instantiate indexed data builders.
        dataset_dtype = getattr(np, preproc_config.indexed_dataset_dtype)
        temp_train_bin = self._get_output_filename(preproc_config, self.BIN, self.TRAIN, temp=True)
        temp_val_bin = self._get_output_filename(preproc_config, self.BIN, self.VAL, temp=True)
        temp_test_bin = self._get_output_filename(preproc_config, self.BIN, self.TEST, temp=True)
        train_builder: IndexedDatasetBuilder = IndexedDatasetBuilder(bin_path=str(temp_train_bin), dtype=dataset_dtype)
        val_builder: IndexedDatasetBuilder = IndexedDatasetBuilder(bin_path=str(temp_val_bin), dtype=dataset_dtype)
        test_builder: IndexedDatasetBuilder = IndexedDatasetBuilder(bin_path=str(temp_test_bin), dtype=dataset_dtype)
        logging.info(f"Created temporary binary datasets: {temp_train_bin} {temp_val_bin} {temp_test_bin}")
        # Preprocess data and split results into train, validation, or test.
        avg_preproc_time = 0.0
        avg_index_time = 0.0
        count = 0
        for sequence, elapsed_time in self.preprocess_generator(preproc_config):
            index_start_time = time.time()
            if sequence["split"] == "train":
                train_builder.add_item(torch.Tensor(sequence["tokens"]))
                train_builder.end_document()
            elif sequence["split"] == "val":
                val_builder.add_item(torch.Tensor(sequence["tokens"]))
                val_builder.end_document()
            elif sequence["split"] == "test":
                test_builder.add_item(torch.Tensor(sequence["tokens"]))
                test_builder.end_document()
            index_end_time = time.time()
            # Update average preprocessing and indexing time.
            avg_preproc_time = (avg_preproc_time * count + elapsed_time) / (count + 1)
            avg_index_time = (avg_index_time * count + index_end_time - index_start_time) / (count + 1)
            count += 1
        # Report timing.
        logging.info(f"Average preprocessing time per sequence: {avg_preproc_time}")
        logging.info(f"Average indexing time per sequence: {avg_index_time}")
        logging.info(f"Number of sequences processed: {count}")
        # Write preprocessed index data to disk. Rename temporary binaries to denote preprocessing completion.
        train_builder.finalize(idx_path=str(self._get_output_filename(preproc_config, self.IDX, self.TRAIN)))
        val_builder.finalize(idx_path=str(self._get_output_filename(preproc_config, self.IDX, self.VAL)))
        test_builder.finalize(idx_path=str(self._get_output_filename(preproc_config, self.IDX, self.TEST)))
        os.rename(temp_train_bin, self._get_output_filename(preproc_config, self.BIN, self.TRAIN))
        os.rename(temp_val_bin, self._get_output_filename(preproc_config, self.BIN, self.VAL))
        os.rename(temp_test_bin, self._get_output_filename(preproc_config, self.BIN, self.TEST))
```
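The class above is driven entirely by an `Evo2PreprocessingConfig`. As a minimal sketch of an end-to-end offline run, assuming `Evo2PreprocessingConfig` accepts the fields below as keyword arguments (all of them are referenced in the source, but the authoritative schema and import path belong to the config class):

```python
from pathlib import Path

from bionemo.evo2.data.preprocess import Evo2Preprocessor  # config import path assumed

# Hypothetical configuration for illustration; consult Evo2PreprocessingConfig
# for the authoritative field names, types, and defaults.
config = Evo2PreprocessingConfig(
    datapaths=[Path("genomes/sample.fasta")],  # input FASTA files
    output_dir=Path("preprocessed"),
    output_prefix="evo2_sample",
    train_split=0.9,
    valid_split=0.05,
    test_split=0.05,
    seed=42,     # makes reverse-complementing and split assignment reproducible
    workers=4,   # >1 enables the multiprocessing pool in preprocess_generator
    overwrite=False,
)

preprocessor = Evo2Preprocessor(config)
# Writes <prefix>_<tokenizer>_{train,val,test}.bin/.idx files under output_dir.
preprocessor.preprocess_offline(config)
```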
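To make the spacer arithmetic in `preprocess_data` concrete: when a taxonomy token is available, each subsequence window is shortened by the token's length, so token plus window always total `prompt_spacer_length` characters and a lineage token recurs at a fixed period in the emitted text. A self-contained sketch with made-up values:

```python
# Illustrative only: dummy token, spacer length, and sequence.
taxonomy_token = "|d__Bacteria;p__None;c__None;o__None;f__None;g__None;s__None|"
prompt_spacer_length = 128
sequence = "ACGT" * 100  # 400 bp dummy sequence

# Same shortening rule as preprocess_data.
target_length = prompt_spacer_length - len(taxonomy_token)

def subsequence_generator(seq: str, length: int, step: int):
    # Mirrors Evo2Preprocessor._subsequence_generator.
    for i in range(0, len(seq), step):
        yield seq[i : i + length]

chunks = [taxonomy_token + sub for sub in subsequence_generator(sequence, target_length, target_length)]
# Every chunk except possibly the truncated last one is exactly
# prompt_spacer_length characters long.
assert all(len(c) == prompt_spacer_length for c in chunks[:-1])
text = "".join(chunks)
```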
preprocess_data_task(file_sequence_config)
Wrapper function to unpack args for preprocess_data.
Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `file_sequence_config` | `tuple` | Tuple containing arguments for `preprocess_data`. | required |
Returns:

| Type | Description |
|------|-------------|
| `tuple[list[dict], float]` | Preprocessed data and the time taken for preprocessing. |
Source code in bionemo/evo2/data/preprocess.py
```python
def preprocess_data_task(self, file_sequence_config):
    """Wrapper function to unpack args for preprocess_data.

    Args:
        file_sequence_config (tuple): Tuple containing arguments for preprocess_data.

    Returns:
        tuple[list[dict], float]: Preprocessed data and the time taken for preprocessing.
    """
    return self.preprocess_data(*file_sequence_config)
```
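The tuple layout matches what `_yield_sequences_from_files` produces: `(filepath, seqid, sequence, seq_idx, config)`. A hypothetical call, reusing the `preprocessor` and `config` objects from the earlier sketch:

```python
# Hypothetical arguments; the order mirrors _yield_sequences_from_files.
task_args = ("genomes/sample.fasta", "chr1", "ACGTACGTACGT", 0, config)
records, elapsed = preprocessor.preprocess_data_task(task_args)
```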
preprocessing_context_manager(seed=None)

Context manager for setting and restoring the random number generator state.
Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `seed` | `int \| None` | Seed for the random number generator. Defaults to None. | `None` |
Source code in bionemo/evo2/data/preprocess.py
```python
@staticmethod
@contextmanager
def preprocessing_context_manager(seed: Optional[int] = None):
    """Context manager for setting and restoring the random number generator state.

    Args:
        seed (int | None): Seed for the random number generator. Defaults to None.
    """
    # Track current state.
    current_state = random.getstate()
    try:
        # Set random seed.
        random.seed(seed)
        yield seed
    finally:
        # Restore random state.
        random.setstate(current_state)
```
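A short sketch of the save/restore behavior: draws inside the context are reproducible under the same seed, and the interpreter's global RNG state is untouched afterwards.

```python
import random

state_before = random.getstate()
with Evo2Preprocessor.preprocessing_context_manager(seed=42):
    first = random.random()
with Evo2Preprocessor.preprocessing_context_manager(seed=42):
    second = random.random()
assert first == second                    # same seed, same draw
assert random.getstate() == state_before  # outer RNG state restored
```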
main()
Main function to execute the preprocessing script.
This function parses command-line arguments, reads the configuration file,
and initiates the preprocessing of data as specified in the configuration.
```python
def main():
    """Main function to execute the preprocessing script.

    This function parses command-line arguments, reads the configuration file,
    and initiates the preprocessing of data as specified in the configuration.
    """
    # Parse arguments.
    args = parse_args()
    # Read config YAML.
    with open(args.config, "r") as yaml_fs:
        evo2_preproc_config_batch = yaml.safe_load(yaml_fs)
    for config in evo2_preproc_config_batch:
        start = time.time()
        # Convert into Evo2PreprocessingConfig.
        evo2_preproc_config = Evo2PreprocessingConfig(**config)
        if evo2_preproc_config.output_dir is not None:
            evo2_preproc_config.output_dir.mkdir(parents=True, exist_ok=True)
        # Instantiate Evo2Preprocessor.
        evo2_preprocessor = Evo2Preprocessor(evo2_preproc_config)
        # Preprocess data specified in config.
        evo2_preprocessor.preprocess_offline(evo2_preproc_config)
        end = time.time()
        logging.info(
            f"Finished preprocessing {evo2_preproc_config.output_prefix} ({evo2_preproc_config.datapaths}) "
            f"in {end - start:.3f} seconds with {evo2_preproc_config.workers} workers."
        )
```
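`main()` expects the YAML file to hold a list of mappings, one per dataset, each of which is unpacked into an `Evo2PreprocessingConfig`. A hypothetical batch file, written here via Python for illustration (field names mirror those used in the source; the config class defines the full schema):

```python
import yaml

batch = [
    {
        "datapaths": ["genomes/ecoli.fasta"],
        "output_dir": "preprocessed/ecoli",
        "output_prefix": "ecoli",
        "train_split": 0.9,
        "valid_split": 0.05,
        "test_split": 0.05,
        "workers": 8,
    },
    # Additional dataset entries may follow; main() preprocesses each in turn.
]
with open("preprocess_config.yaml", "w") as f:
    yaml.safe_dump(batch, f)
```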
parse_args()
Parse arguments for preprocessing.
Source code in bionemo/evo2/data/preprocess.py
```python
def parse_args():
    """Parse arguments for preprocessing."""
    parser = argparse.ArgumentParser(description="Preprocess FASTA files for training Evo2.")
    parser.add_argument("-c", "--config", type=str, required=True, help="Path to data preprocessing config YAML.")
    return parser.parse_args()
```
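Assuming the module is invoked directly as a script, a run then looks like `python preprocess.py --config preprocess_config.yaml` (or `-c` for short), pointing at the YAML batch file described above.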