diff --git a/CLAUDE.md b/CLAUDE.md index f57c05b5..4d54eb77 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -35,7 +35,9 @@ src/communitymech/ │ └── reference_validator.py # Validates evidence items in YAML files └── cli.py # Entry point (not yet implemented) -kb/communities/ # 60 curated community YAML files +kb/communities/ # curated community YAML files (root class MicrobialCommunity) +kb/taxa/ # reusable per-taxon gene records (root class CommonTaxon); + # referenced from taxonomy[].common_taxon; `just validate-taxa` conf/oak_config.yaml # OAK ontology adapter config (NCBITaxon, ENVO, CHEBI, GO) references_cache/ # Cached PubMed abstracts (committed for reproducibility) scripts/ # Utility scripts for curation (not part of package) diff --git a/conf/id_label_targets.yaml b/conf/id_label_targets.yaml index 4496165f..3237b444 100644 --- a/conf/id_label_targets.yaml +++ b/conf/id_label_targets.yaml @@ -92,6 +92,18 @@ targets: - { id: "NCBITaxon:1807132", label: "Candidatus Phormidium alkaliphilum", reason: "absent from current OAK ncbitaxon snapshot and kg-microbe ncbitaxon TSV" } - { id: "NCBITaxon:3050471", label: "Stenotrophomonas goyi", reason: "absent from current OAK ncbitaxon snapshot and kg-microbe ncbitaxon TSV" } + # data inputs (reusable per-taxon gene records) — taxon_term.term (NCBITaxon) + # and gene go_terms (GO) must be canonical. genome {id,label} are NCBI Assembly + # accessions (no CURIE prefix → benign SKIPPED_NO_ADAPTER) and the record's own + # CommunityMech:taxon:* id is an ignored prefix, so only the ontology terms are + # checked here. + - name: taxa_yaml + kind: yaml + glob: "kb/taxa/*.yaml" + policy: canonical + pairs: + - [id, label] + # data product: KGX node export carries (id, name) — canonical OR synonym - name: kgx_nodes kind: tabular diff --git a/justfile b/justfile index 995956ff..c7656632 100644 --- a/justfile +++ b/justfile @@ -23,6 +23,20 @@ validate-all: uv run linkml-validate -s src/communitymech/schema/communitymech.yaml "$file" done +# Validate the reusable per-taxon gene records (kb/taxa/) against CommonTaxon. +# These files have CommonTaxon as their root, not MicrobialCommunity, so the +# target class must be given explicitly. +validate-taxa: + #!/usr/bin/env bash + set -uo pipefail + rc=0 + for file in kb/taxa/*.yaml; do + echo "Validating $file..." + uv run linkml-validate -s src/communitymech/schema/communitymech.yaml \ + --target-class CommonTaxon "$file" || rc=1 + done + exit $rc + # Strict in-process validation in *closed* mode (rejects unknown fields). # Emits reports/instance_validation_failures.tsv and exits 1 on any ERROR. # Catches the same drift class that gave CultureMech 59k silent errors; diff --git a/kb/communities/Shewanella_Geobacter_Exoelectrogenic_Biofilm_Community.yaml b/kb/communities/Shewanella_Geobacter_Exoelectrogenic_Biofilm_Community.yaml index b2b85f03..2c15c173 100644 --- a/kb/communities/Shewanella_Geobacter_Exoelectrogenic_Biofilm_Community.yaml +++ b/kb/communities/Shewanella_Geobacter_Exoelectrogenic_Biofilm_Community.yaml @@ -51,6 +51,7 @@ taxonomy: label: Shewanella oneidensis notes: Exoelectrogenic proteobacterium in the defined anode biofilm community. abundance_level: ABUNDANT + common_taxon: CommunityMech:taxon:000001 functional_role: - SYNTROPHIC_PARTNER - CROSS_FEEDER @@ -73,6 +74,7 @@ taxonomy: notes: Exoelectrogenic Geobacter member that was detected in the planktonic phase of mixed-culture reactors as well as in the anode-associated community. abundance_level: ABUNDANT + common_taxon: CommunityMech:taxon:000002 functional_role: - SYNTROPHIC_PARTNER - CROSS_FEEDER diff --git a/kb/taxa/Geobacter_sulfurreducens.yaml b/kb/taxa/Geobacter_sulfurreducens.yaml new file mode 100644 index 00000000..b9c96670 --- /dev/null +++ b/kb/taxa/Geobacter_sulfurreducens.yaml @@ -0,0 +1,57 @@ +id: CommunityMech:taxon:000002 +taxon_term: + preferred_term: Geobacter sulfurreducens + term: + id: NCBITaxon:35554 + label: Geobacter sulfurreducens +genomes: +- id: GCF_000007985.2 + label: ASM798v2 + notes: RefSeq reference assembly for Geobacter sulfurreducens PCA. +genes: +- gene_id: KEGG:gsu:GSU1496 + gene_symbol: pilA + locus_tag: GSU1496 + product: Type IV pilin; structural subunit of conductive (electrically conductive) pili / e-pili + genome: GCF_000007985.2 + go_terms: + - id: GO:0009055 + label: electron transfer activity + supports_roles: + - SYNTROPHIC_PARTNER + supports_interaction: > + Conductive pili (e-pili) mediate long-range extracellular and direct + interspecies electron transfer (DIET) to partner methanogens and to + Fe(III) oxides/electrodes. +- gene_id: KEGG:gsu:GSU2504 + gene_symbol: omcS + locus_tag: GSU2504 + product: Outer-surface hexaheme c-type cytochrome associated with conductive pili + genome: GCF_000007985.2 + go_terms: + - id: GO:0009055 + label: electron transfer activity + supports_roles: + - SYNTROPHIC_PARTNER + supports_interaction: > + Cytochrome filament/decoration required for extracellular electron transfer + to Fe(III) oxides and for direct interspecies electron transfer to partners. +- gene_id: KEGG:gsu:GSU2076 + gene_symbol: omcZ + locus_tag: GSU2076 + product: Outer-surface octaheme c-type cytochrome + genome: GCF_000007985.2 + go_terms: + - id: GO:0009055 + label: electron transfer activity + supports_roles: + - SYNTROPHIC_PARTNER + supports_interaction: > + Cytochrome essential for high-density current production at electrodes + (extracellular electron transfer to anodes). +notes: > + Reusable taxon record capturing the conductive-pili and multiheme-cytochrome + genes that underpin G. sulfurreducens' electron-transfer / syntrophic-partner + role in DIET cocultures, Fe(III)-reduction, and bioanode communities. Gene/role + claims are literature-standard; per-interaction EvidenceItems can be added + following the community-record evidence protocol. diff --git a/kb/taxa/Shewanella_oneidensis_MR1.yaml b/kb/taxa/Shewanella_oneidensis_MR1.yaml new file mode 100644 index 00000000..7b3c299a --- /dev/null +++ b/kb/taxa/Shewanella_oneidensis_MR1.yaml @@ -0,0 +1,67 @@ +id: CommunityMech:taxon:000001 +taxon_term: + preferred_term: Shewanella oneidensis MR-1 + term: + id: NCBITaxon:211586 + label: Shewanella oneidensis MR-1 +genomes: +- id: GCF_000146165.2 + label: ASM14616v2 + notes: RefSeq reference assembly for Shewanella oneidensis MR-1. +genes: +- gene_id: KEGG:son:SO_1778 + gene_symbol: mtrC + locus_tag: SO_1778 + product: Outer-membrane decaheme c-type cytochrome (terminal extracellular electron-transfer reductase) + genome: GCF_000146165.2 + go_terms: + - id: GO:0009055 + label: electron transfer activity + supports_roles: + - SYNTROPHIC_PARTNER + supports_interaction: > + Terminal reductase of the Mtr extracellular electron transfer pathway; transfers + electrons to extracellular acceptors (e.g. Fe(III)/Mn(IV) oxides, electrodes), + underpinning direct interspecies / electrode electron transfer interactions. +- gene_id: KEGG:son:SO_1776 + gene_symbol: mtrB + locus_tag: SO_1776 + product: Outer-membrane beta-barrel that embeds the MtrA/MtrC cytochrome module + genome: GCF_000146165.2 + supports_roles: + - SYNTROPHIC_PARTNER + supports_interaction: > + Forms the outer-membrane conduit of the MtrCAB porin-cytochrome complex enabling + electron egress to extracellular acceptors. +- gene_id: KEGG:son:SO_1777 + gene_symbol: mtrA + locus_tag: SO_1777 + product: Periplasmic decaheme c-type cytochrome of the Mtr pathway + genome: GCF_000146165.2 + go_terms: + - id: GO:0009055 + label: electron transfer activity + supports_roles: + - SYNTROPHIC_PARTNER + supports_interaction: > + Conducts electrons across the outer membrane within the MtrCAB complex during + extracellular electron transfer. +- gene_id: KEGG:son:SO_4591 + gene_symbol: cymA + locus_tag: SO_4591 + product: Inner-membrane tetraheme c-type cytochrome (menaquinol oxidase) feeding the Mtr pathway + genome: GCF_000146165.2 + go_terms: + - id: GO:0009055 + label: electron transfer activity + supports_roles: + - SYNTROPHIC_PARTNER + supports_interaction: > + Branch point that delivers electrons from the menaquinone pool to periplasmic + and outer-membrane cytochromes for extracellular electron transfer. +notes: > + Reusable taxon record capturing the Mtr extracellular-electron-transfer pathway + genes that underpin S. oneidensis MR-1's electron-donor role in microbial fuel + cell, metal-reduction, and DIET-style communities. Gene/role claims are + literature-standard; per-interaction EvidenceItems can be added following the + community-record evidence protocol. diff --git a/src/communitymech/datamodel/communitymech.py b/src/communitymech/datamodel/communitymech.py index ff3c8b42..6c01fd6e 100644 --- a/src/communitymech/datamodel/communitymech.py +++ b/src/communitymech/datamodel/communitymech.py @@ -1,5 +1,5 @@ # Auto generated from communitymech.yaml by pythongen.py version: 0.0.1 -# Generation date: 2026-06-08T21:28:17 +# Generation date: 2026-06-17T20:06:08 # Schema: communitymech # # id: https://w3id.org/communitymech @@ -57,6 +57,10 @@ class MicrobialCommunityId(extended_str): pass +class CommonTaxonId(extended_str): + pass + + @dataclass(repr=False) class Term(YAMLRoot): """ @@ -388,6 +392,11 @@ class TaxonomicComposition(YAMLRoot): strain_designation: Optional[Union[dict, StrainDesignation]] = None abundance_level: Optional[Union[str, "AbundanceEnum"]] = None abundance_value: Optional[str] = None + absolute_abundance: Optional[float] = None + absolute_abundance_unit: Optional[str] = None + relative_abundance: Optional[float] = None + relative_abundance_unit: Optional[str] = None + common_taxon: Optional[str] = None functional_role: Optional[ Union[Union[str, "FunctionalRoleEnum"], list[Union[str, "FunctionalRoleEnum"]]] ] = empty_list() @@ -412,6 +421,25 @@ def __post_init__(self, *_: str, **kwargs: Any): if self.abundance_value is not None and not isinstance(self.abundance_value, str): self.abundance_value = str(self.abundance_value) + if self.absolute_abundance is not None and not isinstance(self.absolute_abundance, float): + self.absolute_abundance = float(self.absolute_abundance) + + if self.absolute_abundance_unit is not None and not isinstance( + self.absolute_abundance_unit, str + ): + self.absolute_abundance_unit = str(self.absolute_abundance_unit) + + if self.relative_abundance is not None and not isinstance(self.relative_abundance, float): + self.relative_abundance = float(self.relative_abundance) + + if self.relative_abundance_unit is not None and not isinstance( + self.relative_abundance_unit, str + ): + self.relative_abundance_unit = str(self.relative_abundance_unit) + + if self.common_taxon is not None and not isinstance(self.common_taxon, str): + self.common_taxon = str(self.common_taxon) + if not isinstance(self.functional_role, list): self.functional_role = ( [self.functional_role] if self.functional_role is not None else [] @@ -1281,6 +1309,183 @@ def __post_init__(self, *_: str, **kwargs: Any): super().__post_init__(**kwargs) +@dataclass(repr=False) +class CommonTaxon(YAMLRoot): + """ + A reusable taxon record: an NCBITaxon-grounded organism together with its reference genome(s) and the genes known + to support its community role(s) or specific ecological interactions. Maintained once and referenced by many + community records. + """ + + _inherited_slots: ClassVar[list[str]] = [] + + class_class_uri: ClassVar[URIRef] = COMMUNITYMECH["CommonTaxon"] + class_class_curie: ClassVar[str] = "communitymech:CommonTaxon" + class_name: ClassVar[str] = "CommonTaxon" + class_model_uri: ClassVar[URIRef] = COMMUNITYMECH.CommonTaxon + + id: Union[str, CommonTaxonId] = None + taxon_term: Union[dict, TaxonDescriptor] = None + genomes: Optional[Union[Union[dict, "GenomeRecord"], list[Union[dict, "GenomeRecord"]]]] = ( + empty_list() + ) + genes: Optional[Union[Union[dict, "GeneAnnotation"], list[Union[dict, "GeneAnnotation"]]]] = ( + empty_list() + ) + notes: Optional[str] = None + curation_history: Optional[ + Union[Union[dict, CurationEvent], list[Union[dict, CurationEvent]]] + ] = empty_list() + + def __post_init__(self, *_: str, **kwargs: Any): + if self._is_empty(self.id): + self.MissingRequiredField("id") + if not isinstance(self.id, CommonTaxonId): + self.id = CommonTaxonId(self.id) + + if self._is_empty(self.taxon_term): + self.MissingRequiredField("taxon_term") + if not isinstance(self.taxon_term, TaxonDescriptor): + self.taxon_term = TaxonDescriptor(**as_dict(self.taxon_term)) + + if not isinstance(self.genomes, list): + self.genomes = [self.genomes] if self.genomes is not None else [] + self.genomes = [ + v if isinstance(v, GenomeRecord) else GenomeRecord(**as_dict(v)) for v in self.genomes + ] + + if not isinstance(self.genes, list): + self.genes = [self.genes] if self.genes is not None else [] + self.genes = [ + v if isinstance(v, GeneAnnotation) else GeneAnnotation(**as_dict(v)) for v in self.genes + ] + + if self.notes is not None and not isinstance(self.notes, str): + self.notes = str(self.notes) + + if not isinstance(self.curation_history, list): + self.curation_history = ( + [self.curation_history] if self.curation_history is not None else [] + ) + self.curation_history = [ + v if isinstance(v, CurationEvent) else CurationEvent(**as_dict(v)) + for v in self.curation_history + ] + + super().__post_init__(**kwargs) + + +@dataclass(repr=False) +class GenomeRecord(YAMLRoot): + """ + A reference genome assembly for a taxon. + """ + + _inherited_slots: ClassVar[list[str]] = [] + + class_class_uri: ClassVar[URIRef] = COMMUNITYMECH["GenomeRecord"] + class_class_curie: ClassVar[str] = "communitymech:GenomeRecord" + class_name: ClassVar[str] = "GenomeRecord" + class_model_uri: ClassVar[URIRef] = COMMUNITYMECH.GenomeRecord + + id: str = None + label: Optional[str] = None + strain_designation: Optional[Union[dict, StrainDesignation]] = None + notes: Optional[str] = None + + def __post_init__(self, *_: str, **kwargs: Any): + if self._is_empty(self.id): + self.MissingRequiredField("id") + if not isinstance(self.id, str): + self.id = str(self.id) + + if self.label is not None and not isinstance(self.label, str): + self.label = str(self.label) + + if self.strain_designation is not None and not isinstance( + self.strain_designation, StrainDesignation + ): + self.strain_designation = StrainDesignation(**as_dict(self.strain_designation)) + + if self.notes is not None and not isinstance(self.notes, str): + self.notes = str(self.notes) + + super().__post_init__(**kwargs) + + +@dataclass(repr=False) +class GeneAnnotation(YAMLRoot): + """ + A gene that supports a taxon's community role or a specific ecological interaction, with standardized identifiers + and supporting evidence. + """ + + _inherited_slots: ClassVar[list[str]] = [] + + class_class_uri: ClassVar[URIRef] = COMMUNITYMECH["GeneAnnotation"] + class_class_curie: ClassVar[str] = "communitymech:GeneAnnotation" + class_name: ClassVar[str] = "GeneAnnotation" + class_model_uri: ClassVar[URIRef] = COMMUNITYMECH.GeneAnnotation + + gene_id: str = None + gene_symbol: Optional[str] = None + locus_tag: Optional[str] = None + product: Optional[str] = None + genome: Optional[str] = None + kegg_ortholog: Optional[str] = None + go_terms: Optional[Union[Union[dict, Term], list[Union[dict, Term]]]] = empty_list() + supports_roles: Optional[ + Union[Union[str, "FunctionalRoleEnum"], list[Union[str, "FunctionalRoleEnum"]]] + ] = empty_list() + supports_interaction: Optional[str] = None + evidence: Optional[Union[Union[dict, EvidenceItem], list[Union[dict, EvidenceItem]]]] = ( + empty_list() + ) + + def __post_init__(self, *_: str, **kwargs: Any): + if self._is_empty(self.gene_id): + self.MissingRequiredField("gene_id") + if not isinstance(self.gene_id, str): + self.gene_id = str(self.gene_id) + + if self.gene_symbol is not None and not isinstance(self.gene_symbol, str): + self.gene_symbol = str(self.gene_symbol) + + if self.locus_tag is not None and not isinstance(self.locus_tag, str): + self.locus_tag = str(self.locus_tag) + + if self.product is not None and not isinstance(self.product, str): + self.product = str(self.product) + + if self.genome is not None and not isinstance(self.genome, str): + self.genome = str(self.genome) + + if self.kegg_ortholog is not None and not isinstance(self.kegg_ortholog, str): + self.kegg_ortholog = str(self.kegg_ortholog) + + if not isinstance(self.go_terms, list): + self.go_terms = [self.go_terms] if self.go_terms is not None else [] + self.go_terms = [v if isinstance(v, Term) else Term(**as_dict(v)) for v in self.go_terms] + + if not isinstance(self.supports_roles, list): + self.supports_roles = [self.supports_roles] if self.supports_roles is not None else [] + self.supports_roles = [ + v if isinstance(v, FunctionalRoleEnum) else FunctionalRoleEnum(v) + for v in self.supports_roles + ] + + if self.supports_interaction is not None and not isinstance(self.supports_interaction, str): + self.supports_interaction = str(self.supports_interaction) + + if not isinstance(self.evidence, list): + self.evidence = [self.evidence] if self.evidence is not None else [] + self.evidence = [ + v if isinstance(v, EvidenceItem) else EvidenceItem(**as_dict(v)) for v in self.evidence + ] + + super().__post_init__(**kwargs) + + # Enumerations class EvidenceItemSupportEnum(EnumDefinitionImpl): """ @@ -2179,6 +2384,52 @@ class slots: range=Optional[str], ) +slots.taxonomicComposition__absolute_abundance = Slot( + uri=COMMUNITYMECH.absolute_abundance, + name="taxonomicComposition__absolute_abundance", + curie=COMMUNITYMECH.curie("absolute_abundance"), + model_uri=COMMUNITYMECH.taxonomicComposition__absolute_abundance, + domain=None, + range=Optional[float], +) + +slots.taxonomicComposition__absolute_abundance_unit = Slot( + uri=COMMUNITYMECH.absolute_abundance_unit, + name="taxonomicComposition__absolute_abundance_unit", + curie=COMMUNITYMECH.curie("absolute_abundance_unit"), + model_uri=COMMUNITYMECH.taxonomicComposition__absolute_abundance_unit, + domain=None, + range=Optional[str], +) + +slots.taxonomicComposition__relative_abundance = Slot( + uri=COMMUNITYMECH.relative_abundance, + name="taxonomicComposition__relative_abundance", + curie=COMMUNITYMECH.curie("relative_abundance"), + model_uri=COMMUNITYMECH.taxonomicComposition__relative_abundance, + domain=None, + range=Optional[float], +) + +slots.taxonomicComposition__relative_abundance_unit = Slot( + uri=COMMUNITYMECH.relative_abundance_unit, + name="taxonomicComposition__relative_abundance_unit", + curie=COMMUNITYMECH.curie("relative_abundance_unit"), + model_uri=COMMUNITYMECH.taxonomicComposition__relative_abundance_unit, + domain=None, + range=Optional[str], +) + +slots.taxonomicComposition__common_taxon = Slot( + uri=COMMUNITYMECH.common_taxon, + name="taxonomicComposition__common_taxon", + curie=COMMUNITYMECH.curie("common_taxon"), + model_uri=COMMUNITYMECH.taxonomicComposition__common_taxon, + domain=None, + range=Optional[str], + pattern=re.compile(r"^CommunityMech:taxon:\d{6}$"), +) + slots.taxonomicComposition__functional_role = Slot( uri=COMMUNITYMECH.functional_role, name="taxonomicComposition__functional_role", @@ -3244,3 +3495,187 @@ class slots: domain=None, range=Optional[Union[bool, Bool]], ) + +slots.commonTaxon__id = Slot( + uri=COMMUNITYMECH.id, + name="commonTaxon__id", + curie=COMMUNITYMECH.curie("id"), + model_uri=COMMUNITYMECH.commonTaxon__id, + domain=None, + range=URIRef, + pattern=re.compile(r"^CommunityMech:taxon:\d{6}$"), +) + +slots.commonTaxon__taxon_term = Slot( + uri=COMMUNITYMECH.taxon_term, + name="commonTaxon__taxon_term", + curie=COMMUNITYMECH.curie("taxon_term"), + model_uri=COMMUNITYMECH.commonTaxon__taxon_term, + domain=None, + range=Union[dict, TaxonDescriptor], +) + +slots.commonTaxon__genomes = Slot( + uri=COMMUNITYMECH.genomes, + name="commonTaxon__genomes", + curie=COMMUNITYMECH.curie("genomes"), + model_uri=COMMUNITYMECH.commonTaxon__genomes, + domain=None, + range=Optional[Union[Union[dict, GenomeRecord], list[Union[dict, GenomeRecord]]]], +) + +slots.commonTaxon__genes = Slot( + uri=COMMUNITYMECH.genes, + name="commonTaxon__genes", + curie=COMMUNITYMECH.curie("genes"), + model_uri=COMMUNITYMECH.commonTaxon__genes, + domain=None, + range=Optional[Union[Union[dict, GeneAnnotation], list[Union[dict, GeneAnnotation]]]], +) + +slots.commonTaxon__notes = Slot( + uri=COMMUNITYMECH.notes, + name="commonTaxon__notes", + curie=COMMUNITYMECH.curie("notes"), + model_uri=COMMUNITYMECH.commonTaxon__notes, + domain=None, + range=Optional[str], +) + +slots.commonTaxon__curation_history = Slot( + uri=COMMUNITYMECH.curation_history, + name="commonTaxon__curation_history", + curie=COMMUNITYMECH.curie("curation_history"), + model_uri=COMMUNITYMECH.commonTaxon__curation_history, + domain=None, + range=Optional[Union[Union[dict, CurationEvent], list[Union[dict, CurationEvent]]]], +) + +slots.genomeRecord__id = Slot( + uri=COMMUNITYMECH.id, + name="genomeRecord__id", + curie=COMMUNITYMECH.curie("id"), + model_uri=COMMUNITYMECH.genomeRecord__id, + domain=None, + range=str, + pattern=re.compile(r"^GC[AF]_[0-9]{9}\.[0-9]+$"), +) + +slots.genomeRecord__label = Slot( + uri=COMMUNITYMECH.label, + name="genomeRecord__label", + curie=COMMUNITYMECH.curie("label"), + model_uri=COMMUNITYMECH.genomeRecord__label, + domain=None, + range=Optional[str], +) + +slots.genomeRecord__strain_designation = Slot( + uri=COMMUNITYMECH.strain_designation, + name="genomeRecord__strain_designation", + curie=COMMUNITYMECH.curie("strain_designation"), + model_uri=COMMUNITYMECH.genomeRecord__strain_designation, + domain=None, + range=Optional[Union[dict, StrainDesignation]], +) + +slots.genomeRecord__notes = Slot( + uri=COMMUNITYMECH.notes, + name="genomeRecord__notes", + curie=COMMUNITYMECH.curie("notes"), + model_uri=COMMUNITYMECH.genomeRecord__notes, + domain=None, + range=Optional[str], +) + +slots.geneAnnotation__gene_id = Slot( + uri=COMMUNITYMECH.gene_id, + name="geneAnnotation__gene_id", + curie=COMMUNITYMECH.curie("gene_id"), + model_uri=COMMUNITYMECH.geneAnnotation__gene_id, + domain=None, + range=str, +) + +slots.geneAnnotation__gene_symbol = Slot( + uri=COMMUNITYMECH.gene_symbol, + name="geneAnnotation__gene_symbol", + curie=COMMUNITYMECH.curie("gene_symbol"), + model_uri=COMMUNITYMECH.geneAnnotation__gene_symbol, + domain=None, + range=Optional[str], +) + +slots.geneAnnotation__locus_tag = Slot( + uri=COMMUNITYMECH.locus_tag, + name="geneAnnotation__locus_tag", + curie=COMMUNITYMECH.curie("locus_tag"), + model_uri=COMMUNITYMECH.geneAnnotation__locus_tag, + domain=None, + range=Optional[str], +) + +slots.geneAnnotation__product = Slot( + uri=COMMUNITYMECH.product, + name="geneAnnotation__product", + curie=COMMUNITYMECH.curie("product"), + model_uri=COMMUNITYMECH.geneAnnotation__product, + domain=None, + range=Optional[str], +) + +slots.geneAnnotation__genome = Slot( + uri=COMMUNITYMECH.genome, + name="geneAnnotation__genome", + curie=COMMUNITYMECH.curie("genome"), + model_uri=COMMUNITYMECH.geneAnnotation__genome, + domain=None, + range=Optional[str], + pattern=re.compile(r"^GC[AF]_[0-9]{9}\.[0-9]+$"), +) + +slots.geneAnnotation__kegg_ortholog = Slot( + uri=COMMUNITYMECH.kegg_ortholog, + name="geneAnnotation__kegg_ortholog", + curie=COMMUNITYMECH.curie("kegg_ortholog"), + model_uri=COMMUNITYMECH.geneAnnotation__kegg_ortholog, + domain=None, + range=Optional[str], + pattern=re.compile(r"^K[0-9]{5}$"), +) + +slots.geneAnnotation__go_terms = Slot( + uri=COMMUNITYMECH.go_terms, + name="geneAnnotation__go_terms", + curie=COMMUNITYMECH.curie("go_terms"), + model_uri=COMMUNITYMECH.geneAnnotation__go_terms, + domain=None, + range=Optional[Union[Union[dict, Term], list[Union[dict, Term]]]], +) + +slots.geneAnnotation__supports_roles = Slot( + uri=COMMUNITYMECH.supports_roles, + name="geneAnnotation__supports_roles", + curie=COMMUNITYMECH.curie("supports_roles"), + model_uri=COMMUNITYMECH.geneAnnotation__supports_roles, + domain=None, + range=Optional[Union[Union[str, "FunctionalRoleEnum"], list[Union[str, "FunctionalRoleEnum"]]]], +) + +slots.geneAnnotation__supports_interaction = Slot( + uri=COMMUNITYMECH.supports_interaction, + name="geneAnnotation__supports_interaction", + curie=COMMUNITYMECH.curie("supports_interaction"), + model_uri=COMMUNITYMECH.geneAnnotation__supports_interaction, + domain=None, + range=Optional[str], +) + +slots.geneAnnotation__evidence = Slot( + uri=COMMUNITYMECH.evidence, + name="geneAnnotation__evidence", + curie=COMMUNITYMECH.curie("evidence"), + model_uri=COMMUNITYMECH.geneAnnotation__evidence, + domain=None, + range=Optional[Union[Union[dict, EvidenceItem], list[Union[dict, EvidenceItem]]]], +) diff --git a/src/communitymech/schema/communitymech.yaml b/src/communitymech/schema/communitymech.yaml index a5c36287..f3c68f9e 100644 --- a/src/communitymech/schema/communitymech.yaml +++ b/src/communitymech/schema/communitymech.yaml @@ -630,7 +630,43 @@ classes: description: Relative abundance category range: AbundanceEnum abundance_value: - description: Quantitative abundance (%, cell count, etc.) + description: >- + DEPRECATED free-text quantitative abundance. Prefer the typed + absolute_abundance / relative_abundance fields below; retained for + back-compatibility with records that used a single combined value. + absolute_abundance: + description: >- + Absolute abundance of this taxon (e.g. cell count, CFU/mL, read count, + gene copies). Optional and independent of relative_abundance; pair with + absolute_abundance_unit. + range: float + required: false + absolute_abundance_unit: + description: >- + Unit/basis for absolute_abundance (e.g. cells/mL, CFU/mL, reads, + copies/g). + required: false + relative_abundance: + description: >- + Relative abundance of this taxon within the community. Optional and + independent of absolute_abundance. Interpreted as a fraction in [0, 1] + unless relative_abundance_unit says otherwise (e.g. percent). + range: float + required: false + minimum_value: 0 + relative_abundance_unit: + description: >- + Unit/basis for relative_abundance (e.g. "fraction", "percent", + "16S rRNA reads", "metagenome relative abundance"). Defaults to a + fraction in [0, 1] when omitted. + required: false + common_taxon: + description: >- + Reference to a reusable CommonTaxon record (kb/taxa/) that holds the + genome and gene annotations for this organism, so the same curated gene + set can be shared across community records. + required: false + pattern: "^CommunityMech:taxon:\\d{6}$" functional_role: description: Role(s) in the community range: FunctionalRoleEnum @@ -1067,3 +1103,112 @@ classes: llm_assisted: range: boolean description: Whether LLM assistance was used. + + # -- Reusable per-taxon gene records ------------------------------------ + # CommonTaxon records live as standalone YAML files under kb/taxa/ and are + # referenced from MicrobialCommunity.taxonomy[].common_taxon, so a curated set + # of genome + gene annotations for an organism can be shared across community + # records instead of being duplicated in each one. Validate these files with + # ``just validate-taxa`` (linkml-validate --target-class CommonTaxon). + CommonTaxon: + description: >- + A reusable taxon record: an NCBITaxon-grounded organism together with its + reference genome(s) and the genes known to support its community role(s) or + specific ecological interactions. Maintained once and referenced by many + community records. + attributes: + id: + description: Unique identifier for this common-taxon record. + identifier: true + required: true + pattern: "^CommunityMech:taxon:\\d{6}$" + taxon_term: + description: NCBITaxon-grounded organism this record describes. + range: TaxonDescriptor + required: true + genomes: + description: Reference genome assembly(ies) for this taxon. + range: GenomeRecord + multivalued: true + inlined_as_list: true + genes: + description: >- + Genes known to support this taxon's community role(s) or specific + interactions, with standardized gene/genome ids and evidence. + range: GeneAnnotation + multivalued: true + inlined_as_list: true + notes: + description: Additional notes about this taxon record. + curation_history: + description: Append-only audit trail of curation actions on this record. + range: CurationEvent + multivalued: true + inlined_as_list: true + + GenomeRecord: + description: A reference genome assembly for a taxon. + attributes: + id: + description: >- + NCBI Assembly accession for the genome (RefSeq GCF_ or GenBank GCA_), + e.g. GCF_000005845.2. + required: true + pattern: "^GC[AF]_[0-9]{9}\\.[0-9]+$" + label: + description: Human-readable assembly name (e.g. "ASM584v2"). + strain_designation: + description: Strain this assembly corresponds to. + range: StrainDesignation + notes: + description: Notes about this assembly (assembly level, source, etc.). + + GeneAnnotation: + description: >- + A gene that supports a taxon's community role or a specific ecological + interaction, with standardized identifiers and supporting evidence. + attributes: + gene_id: + description: >- + Primary standardized gene/protein identifier, as a CURIE. Accepted + prefixes: NCBIGene (e.g. NCBIGene:948242), UniProtKB + (e.g. UniProtKB:P0A6F5), or KEGG gene (e.g. KEGG:eco:b3987). For genes + best identified per-assembly, give locus_tag; for the orthologous + function, give kegg_ortholog. + required: true + gene_symbol: + description: Gene symbol / short name (e.g. dsrA, nifH, mtrC). + locus_tag: + description: >- + Genome locus tag tying the gene to a specific assembly + (e.g. b3987, SO_1776). + product: + description: Gene product or functional description. + genome: + description: >- + NCBI Assembly accession (GCF_/GCA_) of the genome this gene is annotated + in; should match one of the parent CommonTaxon.genomes[].id. + pattern: "^GC[AF]_[0-9]{9}\\.[0-9]+$" + kegg_ortholog: + description: KEGG Orthology id for the gene's function (e.g. K00370). + pattern: "^K[0-9]{5}$" + go_terms: + description: >- + GO molecular-function / biological-process terms describing this gene's + activity (canonical id↔label, checked by the term gate). + range: Term + multivalued: true + inlined_as_list: true + supports_roles: + description: Community functional role(s) this gene supports. + range: FunctionalRoleEnum + multivalued: true + supports_interaction: + description: >- + Specific community interaction(s) this gene mediates, as a free-text + description (may name an EcologicalInteraction from a community record). + evidence: + description: Evidence linking this gene to the role/interaction. + range: EvidenceItem + multivalued: true + inlined_as_list: true diff --git a/tests/test_taxon_records.py b/tests/test_taxon_records.py new file mode 100644 index 00000000..63ec97a8 --- /dev/null +++ b/tests/test_taxon_records.py @@ -0,0 +1,93 @@ +"""Tests for the reusable CommonTaxon records (kb/taxa/) and the new +abundance / common_taxon fields on TaxonomicComposition.""" + +import subprocess +from pathlib import Path + +import yaml + +SCHEMA = "src/communitymech/schema/communitymech.yaml" +TAXA_DIR = Path("kb/taxa") + + +def _validate(target_class: str, path: Path) -> subprocess.CompletedProcess: + return subprocess.run( + ["uv", "run", "linkml-validate", "-s", SCHEMA, "--target-class", target_class, str(path)], + capture_output=True, + text=True, + ) + + +def test_taxa_records_validate_against_common_taxon(): + files = sorted(TAXA_DIR.glob("*.yaml")) + assert files, "no kb/taxa/*.yaml records found" + for f in files: + res = _validate("CommonTaxon", f) + assert ( + res.returncode == 0 + ), f"{f} failed CommonTaxon validation:\n{res.stdout}\n{res.stderr}" + + +def test_taxa_record_structure(): + rec = yaml.safe_load((TAXA_DIR / "Shewanella_oneidensis_MR1.yaml").read_text()) + assert rec["id"] == "CommunityMech:taxon:000001" + assert rec["taxon_term"]["term"]["id"] == "NCBITaxon:211586" + # genomes use NCBI Assembly accessions + assert rec["genomes"][0]["id"].startswith("GCF_") + # genes carry standardized ids + locus tags + GO function terms + genes = {g["gene_symbol"]: g for g in rec["genes"]} + assert "mtrC" in genes + assert genes["mtrC"]["gene_id"].startswith("KEGG:") + assert genes["mtrC"]["locus_tag"] == "SO_1778" + assert genes["mtrC"]["go_terms"][0]["id"] == "GO:0009055" + assert genes["mtrC"]["genome"] == rec["genomes"][0]["id"] + + +def test_schema_defines_new_fields(): + schema = yaml.safe_load(Path(SCHEMA).read_text()) + classes = schema["classes"] + tc = classes["TaxonomicComposition"]["attributes"] + for slot in ("absolute_abundance", "relative_abundance", "common_taxon"): + assert slot in tc, f"TaxonomicComposition missing {slot}" + # absolute and relative abundance are independent, optional, numeric + assert tc["absolute_abundance"]["range"] == "float" + assert tc["relative_abundance"]["range"] == "float" + assert tc["absolute_abundance"].get("required", False) is False + assert tc["relative_abundance"].get("required", False) is False + for cls in ("CommonTaxon", "GenomeRecord", "GeneAnnotation"): + assert cls in classes, f"schema missing class {cls}" + + +def test_community_links_common_taxon(): + """The demonstration community references the reusable taxon records.""" + comm = yaml.safe_load( + Path( + "kb/communities/Shewanella_Geobacter_Exoelectrogenic_Biofilm_Community.yaml" + ).read_text() + ) + refs = {t.get("common_taxon") for t in comm["taxonomy"] if t.get("common_taxon")} + assert {"CommunityMech:taxon:000001", "CommunityMech:taxon:000002"} <= refs + + +def test_taxoncomposition_accepts_separate_abundances(tmp_path): + """A TaxonomicComposition with both absolute and relative abundance validates.""" + community = { + "id": "CommunityMech:000999", + "name": "abundance field smoke test", + "taxonomy": [ + { + "taxon_term": { + "preferred_term": "Escherichia coli", + "term": {"id": "NCBITaxon:562", "label": "Escherichia coli"}, + }, + "absolute_abundance": 1.2e8, + "absolute_abundance_unit": "cells/mL", + "relative_abundance": 0.35, + "relative_abundance_unit": "fraction", + } + ], + } + f = tmp_path / "c.yaml" + f.write_text(yaml.safe_dump(community)) + res = _validate("MicrobialCommunity", f) + assert res.returncode == 0, f"abundance smoke test failed:\n{res.stdout}\n{res.stderr}"