CharacterModel Object Model
From Phyloinformatics
Currently, different programmers are coming up with their own model definition languages for character substitution models. This is expected to cause problems for users in the future, impeding the ready exchange and rapid validation of models across the array of programs that apply such models.
Here we develop an interface definition language that encapsulates current (and future) substitution models. Our planned long-term efforts are:
- Add wrappers for CIPRES and HyPhy after the meeting
- Create our own instance of an IDL for describing statistical models
- Create and test an XML format that applications can exchange (e.g. HyPhy, PAUP, MrBayes, BEAST, CIPRES, Mesquite, Garli)
/*
 * Object model for character substitution models, expressed in CORBA IDL.
 *
 * Layout note: every declaration is placed on its own line so that each
 * `//` line comment terminates where intended. When the definitions are
 * collapsed onto a single line, the first `//` comment swallows every
 * declaration that follows it.
 */

typedef long UID;      // used for fields that identify unique objects (and is hashable)
typedef long ParamID;  // identifier type used for Parameter::id and parameter references
typedef long ExprID;   // identifier type used for Expression::id and expression references

module CipresIDL_api2 {

    /* Describes the specific category of an alphabet or which general
       category it falls under. */
    enum DatatypeEnum {
        DNA_DATATYPE,          // ACGT
        RNA_DATATYPE,          // ACGU
        AA_DATATYPE,
        CODON_DATATYPE,        // AAA, AAC, AAG, AAT, ...
        CATEGORICAL_DATATYPE,  // the generic code for discrete characters ("Standard" in NEXUS)
        CONTINUOUS_DATATYPE,
        DISTANCE_DATATYPE,     // used for taxon to taxon distances (distances block in NEXUS)
        RESTRICTION_FRAGMENT
    };

    /* A list of long values. NOTE(review): presumably the state indices that
       an ambiguity code stands for -- confirm against the intended usage. */
    typedef sequence<long> AmbiguityCode;

    /* A rule for transforming one datatype into another, given as a sequence
       of AmbiguityCode entries. */
    struct DatatypeTransformationRule {
        sequence< AmbiguityCode > mapping;  // -1 means "drop this state"
    };

    /* Description of a datatype (alphabet). */
    struct Datatype {
        DatatypeEnum datatypeEnumVar;
        UID id;                      /* unique identifier */
        sequence<string> stateNames; /* state labels (or "0", "1"... if the type is generic) */
        sequence<DatatypeTransformationRule> transformationRules;
    };

    /* Scope of a parameter (stored in Parameter::scope). */
    enum ParamScopeEnum {
        GLOBAL,
        SUBSET,
        EDGE,
        EDGE_SUBSET
    };

    /* How Parameter::value is to be interpreted (stored in
       Parameter::valueMeaning). */
    enum ParamValueEnum {
        MARGINALIZED,
        FIXED_VALUE,
        TO_ESTIMATE,
        ESTIMATED
    };

    /* Kind of bound (stored in Bound::type). */
    enum BoundTypeEnum {
        NONE,
        OPEN,
        CLOSED
    };

    /* One end of a parameter's range: a bound kind plus a numeric value. */
    struct Bound {
        BoundTypeEnum type;
        double value;
    };

    /* A named model parameter with scope, value, and lower/upper bounds. */
    struct Parameter {
        ParamID id;
        string name;
        ParamScopeEnum scope;
        ParamValueEnum valueMeaning;
        double value;
        Bound lower;
        Bound upper;
    };

    /* Operation codes for the terms of an Expression. */
    enum OpCodeEnum {
        PARAMETER_VALUE,
        NEGATE,
        ADDITION,
        DIVISION,
        MULTIPLICATION,
        SUBTRACTION
    };

    /* A single term of an Expression. */
    struct ExpressionTerm {
        OpCodeEnum opCode;
        // NOTE(review): typed UID although Parameter::id is a ParamID; both are
        // typedefs of long. Consider ParamID here for consistency -- confirm.
        UID paramRef;  // id of the parameter
    };

    /* An identified expression built from ExpressionTerm entries. */
    struct Expression {
        ExprID id;
        sequence<ExpressionTerm> expr;  // reverse polish notation
    };

    /* A probability density, described only by a list of numeric parameters. */
    struct ProbabilityDensity {
        sequence<double> densityParams;
    };

    /* A prior: a density together with the ids of the parameters it refers to. */
    struct Prior {
        ProbabilityDensity dist;
        sequence<ParamID> parameters;
    };

    /* Kind of constraint (stored in
       ParameterConstraint::paramConstraintEnumVar). */
    enum ParamConstraintEnum {
        SUM_EQUALS_VALUE,
        MEAN_EQUALS_VALUE
    };

    /* A constraint of the given kind, with a target value, over the listed
       parameters. */
    struct ParameterConstraint {
        ParamConstraintEnum paramConstraintEnumVar;
        double value;
        sequence<ParamID> parameters;
    };

    /* Selects which description fields of a DiscreteCharacterModel are used. */
    enum ModelDescriptionEnum {
        Q_MATRIX,               // all expressions are in the QMatrix field of the DiscreteCharacterModel
        SYMM_COMPONENT_MATRIX,
        MIXTURE
    };

    /* Forward declaration: DiscreteCharacterModel contains a sequence of
       itself (subModels), so the sequence typedef must come first. */
    struct DiscreteCharacterModel;
    typedef sequence<DiscreteCharacterModel> DiscreteCharacterModelSeq;

    /* A substitution model for discrete characters. */
    struct DiscreteCharacterModel {
        UID datatypeReference; /* Describes the type of data that the model can
                                  be applied to. a datatype object can be one of
                                  the common types or a description of
                                  constraints (e.g. 4-state) */
        boolean isReversible;
        sequence<Expression> expressions;
        sequence<Parameter> parameters;
        ModelDescriptionEnum descriptionType;

        /** used if descriptionType == Q_MATRIX */
        sequence< sequence<ExprID> > qMatrix;

        /** used if descriptionType == SYMM_COMPONENT_MATRIX */
        sequence< sequence<ExprID> > symmComponentOfQMatrix;
        // NOTE(review): it is not clear from the definition whether the three
        // fields below belong only to the SYMM_COMPONENT_MATRIX description
        // or apply to every description type -- confirm.
        sequence<ExprID> equilStateFreq;
        sequence<ParameterConstraint> constraints;
        sequence<Prior> priors;

        /** used if descriptionType == MIXTURE */
        DiscreteCharacterModelSeq subModels;
        sequence<ParamID> mixtureProportions;
        Prior mixtureProportionPrior;
        // NOTE(review): unclear whether rateMultiplier is MIXTURE-only -- confirm.
        ExprID rateMultiplier;
    };
};