enpi_api.examples.apps.sequence_annotation

Sequence Annotation

Sequence Annotation offers a scalable and intuitive way to go from raw data to annotated clones. It annotates B and T cells from multiple species sequenced with any technology. It supports plate- and microfluidics-based protocols, linking paired chains using barcodes and UMI error correction to ensure data of the highest quality.

Example Sequence Annotation run configuration

Run Sequence Annotation on raw sequencing data.

from enpi_api.l2.client.enpi_api_client import EnpiApiClient
from enpi_api.l2.types.sequence_annotation import SequenceTemplateConfig, SequenceTemplateSelector

with EnpiApiClient() as enpi_client:
    # Let us assume the quality control template is called "Default Quality Template"
    # This requires you to have created a template with this name in the ENPICOM Platform
    quality_control_template = enpi_client.sequence_annotation_api.get_quality_control_template_by_name("Default Quality Template")

    # Let us assume we need only a single sequence template, and it is called "Simple IG"
    # This requires you to have created a template with this name in the ENPICOM Platform
    sequence_template = enpi_client.sequence_annotation_api.get_sequence_template_by_name("Simple IG")

    # Let us assume we are working with `Homo sapiens` data
    reference_database_revision = enpi_client.reference_database_api.get_revision_by_name("ENPICOM Platform (MiLaboratories)", "Homo sapiens")

    # Upload the raw sequencing data
    # You can also use files that are already uploaded, then you will have to search for them using the `file_api.get_files` method
    file = enpi_client.file_api.upload_file(file_path="path/to/file.fastq").wait()

    # Run Sequence Annotation
    collection_metadata = enpi_client.sequence_annotation_api.start(
        # Set the name of the resulting collection
        name="My collection",
        # Specify which file(s) to use
        file_ids=[file.id],
        # Specify the sequence template to use per file (it is possible to assign a different template to each file)
        sequence_templates=[SequenceTemplateConfig(selector=SequenceTemplateSelector(value=file.id), id=sequence_template.id)],
        # Specify the quality control template to use
        quality_control_template=quality_control_template,
        # Specify the reference database revision to use
        reference_database_revision=reference_database_revision,
        # Specify whether to archive the raw sequencing data after processing
        archive_inputs=True,
    ).wait()  # Since this starts a long-running operation, we must wait for it to finish if we want to use the result further down

    # Now our collection is guaranteed to be ready to use
    print(f"Collection {collection_metadata.name} has been created with ID {collection_metadata.id}")

Example Sequence Annotation run configuration with framework correction

Showcases how to run Sequence Annotation on raw sequencing data again; however, this example focuses on the optional framework regions correction setting that can be passed to the Sequence Annotation API function.

from enpi_api.l2.client.enpi_api_client import EnpiApiClient
from enpi_api.l2.types.sequence_annotation import CorrectionRegion, CorrectionSettings, SequenceTemplateConfig, SequenceTemplateSelector

with EnpiApiClient() as enpi_client:
    # Let us assume the quality control template is called "Default Quality Template"
    # This requires you to have created a template with this name in the ENPICOM Platform
    quality_control_template = enpi_client.sequence_annotation_api.get_quality_control_template_by_name("Default Quality Template")

    # Let us assume we need only a single sequence template, and it is called "Simple IG"
    # This requires you to have created a template with this name in the ENPICOM Platform
    sequence_template = enpi_client.sequence_annotation_api.get_sequence_template_by_name("Simple IG")

    # Let us assume we are working with `Homo sapiens` data
    reference_database_revision = enpi_client.reference_database_api.get_revision_by_name("ENPICOM Platform (MiLaboratories)", "Homo sapiens")

    # Upload the raw sequencing data
    # You can also use files that are already uploaded, then you will have to search for them using the `file_api.get_files` method
    file = enpi_client.file_api.upload_file(file_path="path/to/file.fastq").wait()

    # Run Sequence Annotation
    collection_metadata = enpi_client.sequence_annotation_api.start(
        # Set the name of the resulting collection
        name="My collection",
        # Specify which file(s) to use
        file_ids=[file.id],
        # Specify the sequence template to use per file (it is possible to assign a different template to each file)
        sequence_templates=[SequenceTemplateConfig(selector=SequenceTemplateSelector(value=file.id), id=sequence_template.id)],
        # Specify the quality control template to use
        quality_control_template=quality_control_template,
        # Specify the reference database revision to use
        reference_database_revision=reference_database_revision,
        # Specify whether to archive the raw sequencing data after processing
        archive_inputs=True,
        # Correct the framework regions to germline and complete the ends of the sequences
        # to full VDJ (missing nucleotides are taken from the germline)
        correction_settings=CorrectionSettings(
            should_complete_ends=True,
            regions=[
                CorrectionRegion.FR1,
                CorrectionRegion.FR2,
                CorrectionRegion.FR3,
                CorrectionRegion.FR4,
            ],
        ),
    ).wait()  # Since this starts a long-running operation, we must wait for it to finish if we want to use the result further down

    # Now our collection is guaranteed to be ready to use
    print(f"Collection {collection_metadata.name} has been created with ID {collection_metadata.id}")

Example Sequence Annotation run configuration with manually set clone identifier

Showcases how to run Sequence Annotation on raw sequencing data again; however, this example focuses on the optional use of manual clone identifiers that link sequences into clones. This flow requires the sequence data (e.g. FASTA files) to contain values that can be used for identifying the clones; these values are extracted from either the filenames or the FASTA headers and are then used to link the sequences present in the files into clones.

from enpi_api.l2.client.enpi_api_client import EnpiApiClient
from enpi_api.l2.types.file import FileId
from enpi_api.l2.types.sequence_annotation import (
    CloneIdentifierExtractionConfig,
    CloneIdentifierExtractionSource,
    SequenceTemplateConfig,
    SequenceTemplateSelector,
)
from enpi_api.l2.types.tag import TagLevel

with EnpiApiClient() as enpi_client:
    """We are assuming that:
        1. You have created a sequence template referenced below for Sequence Annotation with the `Manually Specify Clone Identifier`
            option turned on (can be done only via web version of ENPICOM Platform)
        2. You have created a quality control template referenced below for Sequence Annotation (can be
            done only via web version of ENPICOM Platform)
        3. You have uploaded the FASTA files you want to perform Sequence Annotation on into the ENPICOM Platform already
        4. Your uploaded FASTA files follow this format (values can be changed, but structure must remain):
            >43370843|CloneId=Clone1|Heavy 1|Bmax=1.2
            ACGACGCTCTTCCGATCT...
    """

    # IDs of the FASTA files you want to put into Sequence Annotation. Remember that for this example
    # the FASTA headers need to follow the specific format described above
    fasta_file_ids: list[FileId] = [
        # TODO: Insert FASTA file IDs here. You can browse your files with `enpi_client.file_api.get_files()`
    ]
    assert len(fasta_file_ids) > 0, """Please specify which files you want as input for Sequence Annotation"""

    # Get the reference for the collection
    reference_database_revision = enpi_client.reference_database_api.get_revision_by_name(
        name="ENPICOM Platform (MiLaboratories)",
        species="Homo sapiens",
    )

    # We are looking for a template with an example name `Template with manual clone identifier`.
    # The most important part of the sequence template configuration is the `Manually Specify Clone Identifier` option
    # set to `true` - make sure it is set this way for your template
    required_sequence_template_name = "Template with manual clone identifier"

    # If there is no sequence template matching the required name, or if there are multiple ones
    # matching it, the function will raise an error. In that case, visit the web version of ENPICOM Platform
    # and create, rename or delete templates as needed
    sequence_template = enpi_client.sequence_annotation_api.get_sequence_template_by_name(
        name=required_sequence_template_name,
    )

    # Now get the quality control template. There are no specific requirements for its config this time.
    # This function can raise errors in the same manner as `get_sequence_template_by_name`, which also can
    # be fixed by managing quality control templates in ENPICOM Platform
    required_quality_control_template_name = "Example quality control template"
    quality_control_template = enpi_client.sequence_annotation_api.get_quality_control_template_by_name(
        name=required_quality_control_template_name,
    )

    # Get a tag archetype that will serve as a clone identifier tag. This is where clone identifier values
    # will be stored for each clone. Make sure this tag archetype exists, otherwise an error will be raised
    clone_identifier_tag_archetype_key = "Clone Identifier"
    clone_identifier_tag_archetype = enpi_client.tag_api.get_tag_archetype_by_name(
        level=TagLevel.CLONE,
        key=clone_identifier_tag_archetype_key,
    )
    assert clone_identifier_tag_archetype is not None, f"""Tag archetype with key "{clone_identifier_tag_archetype_key}"
does not exist, please create it first. Use `TagLevel.CLONE` as its tag level and `TagDataType.TEXT` as its resource type as well"""

    # Get ID of the tag archetype
    clone_identifier_tag_archetype_id = clone_identifier_tag_archetype.id

    # The delimiters we use to split the header into parts. We split on the "|" and "="
    # characters. The array below could also be written as a single string "|/=", where
    # the "/" character serves as a divider between the two delimiters
    delimiter = ["|", "="]

    # Pick the third part of the split header string. We start indexing at 0, so the first
    # part has index 0, the second has index 1 and the third has index 2
    index = 2

    enpi_client.sequence_annotation_api.start(
        name="Example Sequence Annotation run with manual clone identifier",
        file_ids=fasta_file_ids,
        reference_database_revision=reference_database_revision,  # Reference for the imported collection
        sequence_templates=[  # Specify which sequence template should be used for each of the files.
            # You could pick different templates for different files, but using the same one for all
            # of them is fine as well - and that is what we will do
            SequenceTemplateConfig(
                selector=SequenceTemplateSelector(
                    value=file_id,
                ),
                id=sequence_template.id,
            )
            for file_id in fasta_file_ids
        ],
        quality_control_template=quality_control_template,
        clone_id_extraction=CloneIdentifierExtractionConfig(
            source=CloneIdentifierExtractionSource.HEADER,  # Specify the source
            # for the clone identifier - in our case in the FASTA header
            delimiter=delimiter,
            index=index,
            target_tag_id=clone_identifier_tag_archetype_id,
        ),
        archive_inputs=False,  # An optional flag that can archive the input FASTA files after Sequence Annotation is done
    ).wait()

View Source

  1'''
  2# Sequence Annotation
  3
  4Sequence Annotation offers a scalable and intuitive way to go from raw data to annotated clones. It annotates B and T cells from multiple species sequenced with any technology.
  5It supports plate- and microfluidics-based protocols, linking paired chains using barcodes and UMI error correction to ensure data of the highest quality.
  6
  7## Example Sequence Annotation run configuration
  8
  9Run Sequence Annotation on raw sequencing data.
 10```python
 11from enpi_api.l2.client.enpi_api_client import EnpiApiClient
 12from enpi_api.l2.types.sequence_annotation import SequenceTemplateConfig, SequenceTemplateSelector
 13
 14with EnpiApiClient() as enpi_client:
 15    # Let us assume the quality control template is called "Default Quality Template"
 16    # This requires you to have created a template with this name in the ENPICOM Platform
 17    quality_control_template = enpi_client.sequence_annotation_api.get_quality_control_template_by_name("Default Quality Template")
 18
 19    # Let us assume we need only a single sequence template, and it is called "Simple IG"
 20    # This requires you to have created a template with this name in the ENPICOM Platform
 21    sequence_template = enpi_client.sequence_annotation_api.get_sequence_template_by_name("Simple IG")
 22
 23    # Let us assume we are working with `Homo sapiens` data
 24    reference_database_revision = enpi_client.reference_database_api.get_revision_by_name("ENPICOM Platform (MiLaboratories)", "Homo sapiens")
 25
 26    # Upload the raw sequencing data
 27    # You can also use files that are already uploaded, then you will have to search for them using the `file_api.get_files` method
 28    file = enpi_client.file_api.upload_file(file_path="path/to/file.fastq").wait()
 29
 30    # Run Sequence Annotation
 31    collection_metadata = enpi_client.sequence_annotation_api.start(
 32        # Set the name of the resulting collection
 33        name="My collection",
 34        # Specify which file(s) to use
 35        file_ids=[file.id],
 36        # Specify the sequence template to use per file (it is possible to assign a different template to each file)
 37        sequence_templates=[SequenceTemplateConfig(selector=SequenceTemplateSelector(value=file.id), id=sequence_template.id)],
 38        # Specify the quality control template to use
 39        quality_control_template=quality_control_template,
 40        # Specify the reference database revision to use
 41        reference_database_revision=reference_database_revision,
 42        # Specify whether to archive the raw sequencing data after processing
 43        archive_inputs=True,
 44    ).wait()  # Since this starts a long-running operation, we must wait for it to finish if we want to use the result further down
 45
 46    # Now our collection is guaranteed to be ready to use
 47    print(f"Collection {collection_metadata.name} has been created with ID {collection_metadata.id}")
 48
 49```
 50## Example Sequence Annotation run configuration with framework correction
 51
 52Showcases how to run Sequence Annotation on raw sequencing data again; however, this example focuses on the optional framework regions correction
 53setting that can be passed to the Sequence Annotation API function.
 54```python
 55from enpi_api.l2.client.enpi_api_client import EnpiApiClient
 56from enpi_api.l2.types.sequence_annotation import CorrectionRegion, CorrectionSettings, SequenceTemplateConfig, SequenceTemplateSelector
 57
 58with EnpiApiClient() as enpi_client:
 59    # Let us assume the quality control template is called "Default Quality Template"
 60    # This requires you to have created a template with this name in the ENPICOM Platform
 61    quality_control_template = enpi_client.sequence_annotation_api.get_quality_control_template_by_name("Default Quality Template")
 62
 63    # Let us assume we need only a single sequence template, and it is called "Simple IG"
 64    # This requires you to have created a template with this name in the ENPICOM Platform
 65    sequence_template = enpi_client.sequence_annotation_api.get_sequence_template_by_name("Simple IG")
 66
 67    # Let us assume we are working with `Homo sapiens` data
 68    reference_database_revision = enpi_client.reference_database_api.get_revision_by_name("ENPICOM Platform (MiLaboratories)", "Homo sapiens")
 69
 70    # Upload the raw sequencing data
 71    # You can also use files that are already uploaded, then you will have to search for them using the `file_api.get_files` method
 72    file = enpi_client.file_api.upload_file(file_path="path/to/file.fastq").wait()
 73
 74    # Run Sequence Annotation
 75    collection_metadata = enpi_client.sequence_annotation_api.start(
 76        # Set the name of the resulting collection
 77        name="My collection",
 78        # Specify which file(s) to use
 79        file_ids=[file.id],
 80        # Specify the sequence template to use per file (it is possible to assign a different template to each file)
 81        sequence_templates=[SequenceTemplateConfig(selector=SequenceTemplateSelector(value=file.id), id=sequence_template.id)],
 82        # Specify the quality control template to use
 83        quality_control_template=quality_control_template,
 84        # Specify the reference database revision to use
 85        reference_database_revision=reference_database_revision,
 86        # Specify whether to archive the raw sequencing data after processing
 87        archive_inputs=True,
 88        # Correct the framework regions to germline and complete the ends of the sequences
 89        # to full VDJ (missing nucleotides are taken from the germline)
 90        correction_settings=CorrectionSettings(
 91            should_complete_ends=True,
 92            regions=[
 93                CorrectionRegion.FR1,
 94                CorrectionRegion.FR2,
 95                CorrectionRegion.FR3,
 96                CorrectionRegion.FR4,
 97            ],
 98        ),
 99    ).wait()  # Since this starts a long-running operation, we must wait for it to finish if we want to use the result further down
100
101    # Now our collection is guaranteed to be ready to use
102    print(f"Collection {collection_metadata.name} has been created with ID {collection_metadata.id}")
103
104```
105## Example Sequence Annotation run configuration with manually set clone identifier
106
107Showcases how to run Sequence Annotation on raw sequencing data again; however, this example focuses on the optional use of manual clone identifiers that link
108sequences into clones. This flow requires the sequence data (e.g. FASTA files) to contain some values that can be used
109for identifying the clones, which can then be extracted from either the filenames or the FASTA headers and then be used in order
110to link sequences present in the files into clones.
111```python
112from enpi_api.l2.client.enpi_api_client import EnpiApiClient
113from enpi_api.l2.types.file import FileId
114from enpi_api.l2.types.sequence_annotation import (
115    CloneIdentifierExtractionConfig,
116    CloneIdentifierExtractionSource,
117    SequenceTemplateConfig,
118    SequenceTemplateSelector,
119)
120from enpi_api.l2.types.tag import TagLevel
121
122with EnpiApiClient() as enpi_client:
123    """We are assuming that:
124        1. You have created a sequence template referenced below for Sequence Annotation with the `Manually Specify Clone Identifier`
125            option turned on (can be done only via web version of ENPICOM Platform)
126        2. You have created a quality control template referenced below for Sequence Annotation (can be
127            done only via web version of ENPICOM Platform)
128        3. You have uploaded the FASTA files you want to perform Sequence Annotation on into the ENPICOM Platform already
129        4. Your uploaded FASTA files follow this format (values can be changed, but structure must remain):
130            >43370843|CloneId=Clone1|Heavy 1|Bmax=1.2
131            ACGACGCTCTTCCGATCT...
132    """
133
134    # IDs of the FASTA files you want to put into Sequence Annotation. Remember that for this example
135    # the FASTA headers need to follow the specific format described above
136    fasta_file_ids: list[FileId] = [
137        # TODO: Insert FASTA file IDs here. You can browse your files with `enpi_client.file_api.get_files()`
138    ]
139    assert len(fasta_file_ids) > 0, """Please specify which files you want as input for Sequence Annotation"""
140
141    # Get the reference for the collection
142    reference_database_revision = enpi_client.reference_database_api.get_revision_by_name(
143        name="ENPICOM Platform (MiLaboratories)",
144        species="Homo sapiens",
145    )
146
147    # We are looking for a template with an example name `Template with manual clone identifier`.
148    # The most important part of the sequence template configuration is the `Manually Specify Clone Identifier` option
149    # set to `true` - make sure it is set this way for your template
150    required_sequence_template_name = "Template with manual clone identifier"
151
152    # If there is no sequence template matching the required name, or if there are multiple ones
153    # matching it, the function will raise an error. In that case, visit the web version of ENPICOM Platform
154    # and create, rename or delete templates as needed
155    sequence_template = enpi_client.sequence_annotation_api.get_sequence_template_by_name(
156        name=required_sequence_template_name,
157    )
158
159    # Now get the quality control template. There are no specific requirements for its config this time.
160    # This function can raise errors in the same manner as `get_sequence_template_by_name`, which also can
161    # be fixed by managing quality control templates in ENPICOM Platform
162    required_quality_control_template_name = "Example quality control template"
163    quality_control_template = enpi_client.sequence_annotation_api.get_quality_control_template_by_name(
164        name=required_quality_control_template_name,
165    )
166
167    # Get a tag archetype that will serve as a clone identifier tag. This is where clone identifier values
168    # will be stored for each clone. Make sure this tag archetype exists, otherwise an error will be raised
169    clone_identifier_tag_archetype_key = "Clone Identifier"
170    clone_identifier_tag_archetype = enpi_client.tag_api.get_tag_archetype_by_name(
171        level=TagLevel.CLONE,
172        key=clone_identifier_tag_archetype_key,
173    )
174    assert clone_identifier_tag_archetype is not None, f"""Tag archetype with key "{clone_identifier_tag_archetype_key}"
175does not exist, please create it first. Use `TagLevel.CLONE` as its tag level and `TagDataType.TEXT` as its resource type as well"""
176
177    # Get ID of the tag archetype
178    clone_identifier_tag_archetype_id = clone_identifier_tag_archetype.id
179
180    # The delimiters we use to split the header into parts. We split on the "|" and "="
181    # characters. The array below could also be written as a single string "|/=", where
182    # the "/" character serves as a divider between the two delimiters
183    delimiter = ["|", "="]
184
185    # Pick the third part of the split header string. We start indexing at 0, so the first
186    # part has index 0, the second has index 1 and the third has index 2
187    index = 2
188
189    enpi_client.sequence_annotation_api.start(
190        name="Example Sequence Annotation run with manual clone identifier",
191        file_ids=fasta_file_ids,
192        reference_database_revision=reference_database_revision,  # Reference for the imported collection
193        sequence_templates=[  # Specify which sequence template should be used for each of the files.
194            # You could pick different templates for different files, but using the same one for all
195            # of them is fine as well - and that is what we will do
196            SequenceTemplateConfig(
197                selector=SequenceTemplateSelector(
198                    value=file_id,
199                ),
200                id=sequence_template.id,
201            )
202            for file_id in fasta_file_ids
203        ],
204        quality_control_template=quality_control_template,
205        clone_id_extraction=CloneIdentifierExtractionConfig(
206            source=CloneIdentifierExtractionSource.HEADER,  # Specify the source
207            # for the clone identifier - in our case in the FASTA header
208            delimiter=delimiter,
209            index=index,
210            target_tag_id=clone_identifier_tag_archetype_id,
211        ),
212        archive_inputs=False,  # An optional flag that can archive the input FASTA files after Sequence Annotation is done
213    ).wait()
214
215```
216'''