Public:CharacterModelObjectModel

From Phyloinformatics
Jump to: navigation, search

Currently, different programmers are devising their own model definition languages for character substitution models. This is expected to cause problems for users in the future, impeding the ready exchange and rapid validation of models across the array of programs that apply such models.

Here we develop an interface definition language that encapsulates current (and future) substitution models. Our planned long-term efforts are:

  • Add wrappers for CIPRES and HyPhy after the meeting
  • Create our own instance of an IDL for describing statistical models
  • Create and test an XML format that applications can exchange (e.g. HyPhy, PAUP, MrBayes, BEAST, CIPRES, Mesquite, Garli)
typedef long UID; // used for fields that identify unique objects (and is hashable)
typedef long ParamID; // identifier of a Parameter (type of Parameter::id)
typedef long ExprID; // identifier of an Expression (type of Expression::id)

module CipresIDL_api2
{
/* Names either the specific alphabet of a datatype or the general category
   that the alphabet falls under.
   Keep the enumerators in this order: IDL assigns ordinals by position.
*/
enum DatatypeEnum
{
    DNA_DATATYPE,          // states A, C, G, T
    RNA_DATATYPE,          // states A, C, G, U
    AA_DATATYPE,           // presumably amino acids -- TODO confirm
    CODON_DATATYPE,        // nucleotide triplets: AAA, AAC, AAG, AAT, ...

    CATEGORICAL_DATATYPE,  // the generic code for discrete characters ("Standard" in NEXUS)
    CONTINUOUS_DATATYPE,
    DISTANCE_DATATYPE,     // taxon-to-taxon distances (distances block in NEXUS)
    RESTRICTION_FRAGMENT
};

// A list of long values; element type of DatatypeTransformationRule::mapping.
// Presumably the state indices that one ambiguity code stands for -- TODO confirm.
typedef sequence<long> AmbiguityCode;
/* A single rule for transforming the states of a datatype.
   NOTE(review): presumably `mapping` holds one AmbiguityCode per source
   state -- confirm against the producers of this struct.
*/
struct DatatypeTransformationRule
{
    // A value of -1 means "drop this state".
    sequence< AmbiguityCode > mapping;
};

/* Description of one datatype: its category, its identifier, the labels of
   its states and the transformation rules attached to it.
*/
struct Datatype
{
    DatatypeEnum datatypeEnumVar;  /* category of this datatype */
    UID id;                        /* unique identifier */
    sequence<string> stateNames;   /* state labels (or "0", "1"... if the type is generic) */
    sequence<DatatypeTransformationRule> transformationRules;
};

/* Scope of a parameter (type of Parameter::scope).
   NOTE(review): presumably GLOBAL = whole analysis, SUBSET = one character
   subset, EDGE = one tree edge, EDGE_SUBSET = one edge within one subset --
   confirm.
*/
enum ParamScopeEnum
{
    GLOBAL,
    SUBSET,
    EDGE,
    EDGE_SUBSET
};

/* Tells how a parameter's value is to be understood
   (type of Parameter::valueMeaning).
   NOTE(review): presumably MARGINALIZED = integrated out, FIXED_VALUE = held
   constant, TO_ESTIMATE = still to be estimated, ESTIMATED = result of an
   estimation -- confirm.
*/
enum ParamValueEnum
{
    MARGINALIZED,
    FIXED_VALUE,
    TO_ESTIMATE,
    ESTIMATED
};

/* Kind of a bound (type of Bound::type).
   NOTE(review): presumably NONE = unbounded, OPEN = limit excluded,
   CLOSED = limit included -- confirm.
*/
enum BoundTypeEnum
{
    NONE,
    OPEN,
    CLOSED
};
	
/* One limit on a parameter value; used for Parameter::lower and
   Parameter::upper.
*/
struct Bound
{
    BoundTypeEnum type;   // kind of bound
    double value;         // the limit; presumably ignored when type == NONE -- confirm
};
	
/* A named model parameter together with its scope, the meaning of its value,
   and its lower and upper bounds.
*/
struct Parameter
{
    ParamID id;                   // identifier other structs use to refer to this parameter
    string name;
    ParamScopeEnum scope;
    ParamValueEnum valueMeaning;  // how `value` is to be interpreted

    double value;
    Bound lower;
    Bound upper;
};

/* Operation codes for the terms of an expression
   (type of ExpressionTerm::opCode).
*/
enum OpCodeEnum
{
    PARAMETER_VALUE,
    NEGATE,
    ADDITION,
    DIVISION,
    MULTIPLICATION,
    SUBTRACTION
};
    
/* One term of an Expression: an operation code plus a parameter reference. */
struct ExpressionTerm
	{
		OpCodeEnum	opCode;
		// Id of the referenced parameter. Declared as ParamID (not the generic
		// UID) so that it has the same type as Parameter::id; both are
		// typedefs of long, so the layout is unchanged.
		// NOTE(review): presumably only meaningful when
		// opCode == PARAMETER_VALUE -- confirm.
		ParamID		paramRef;
	};
	
/* An identified expression built from ExpressionTerm entries. */
struct Expression
{
    ExprID id;                      // identifier other structs use to refer to this expression
    sequence<ExpressionTerm> expr;  // the terms, in reverse polish notation
};

/* A probability density described by a list of numeric parameters.
   NOTE(review): no field names the distribution family -- confirm how
   consumers are expected to determine it.
*/
struct ProbabilityDensity
{
    sequence<double> densityParams;
};

/* Pairs a probability density with a list of parameter ids.
   NOTE(review): presumably the density is the prior on the listed
   parameters -- confirm.
*/
struct Prior
{
    ProbabilityDensity dist;
    sequence<ParamID> parameters;
};

/* Kind of a parameter constraint
   (type of ParameterConstraint::paramConstraintEnumVar).
*/
enum ParamConstraintEnum
{
    SUM_EQUALS_VALUE,
    MEAN_EQUALS_VALUE
};
	
/* A constraint over a set of parameters.
   NOTE(review): presumably the sum (or mean, depending on
   paramConstraintEnumVar) of the listed parameters must equal `value` --
   confirm.
*/
struct ParameterConstraint
{
    ParamConstraintEnum paramConstraintEnumVar;  // which kind of constraint this is
    double value;
    sequence<ParamID> parameters;                // ids of the constrained parameters
};

/* Selects which fields of a DiscreteCharacterModel describe the model
   (type of DiscreteCharacterModel::descriptionType).
*/
enum ModelDescriptionEnum
{
    // all expressions are in the QMatrix field of the DiscreteCharacterModel
    Q_MATRIX,
    // the model uses the symmComponentOfQMatrix and equilStateFreq fields
    SYMM_COMPONENT_MATRIX,
    // the model uses the subModels and mixtureProportion* fields
    MIXTURE
};

// Forward declaration: lets the sequence typedef below name the struct before
// it is defined. DiscreteCharacterModel::subModels uses that typedef, which
// makes the struct recursive.
struct DiscreteCharacterModel;

// A list of models; type of DiscreteCharacterModel::subModels.
typedef sequence<DiscreteCharacterModel> DiscreteCharacterModelSeq;

/* A substitution model for discrete characters. `descriptionType` selects
   which group of fields below carries the model description.
   Field order is part of the interface and must not be changed.
*/
struct DiscreteCharacterModel
{
    /* Describes the type of data that the model can be applied to.
       A datatype object can be one of the common types or a
       description of constraints (e.g. 4-state). */
    UID datatypeReference;

    boolean isReversible;

    /* The expressions and parameters of the model; the matrix fields below
       hold ExprID values. */
    sequence<Expression> expressions;
    sequence<Parameter> parameters;

    ModelDescriptionEnum descriptionType;

    /** used if descriptionType == Q_MATRIX */
    sequence< sequence<ExprID> > qMatrix;

    /** used if descriptionType == SYMM_COMPONENT_MATRIX */
    sequence< sequence<ExprID> > symmComponentOfQMatrix;
    sequence<ExprID> equilStateFreq;

    /* NOTE(review): these two fields carry no descriptionType condition of
       their own; presumably they apply to every description type -- confirm. */
    sequence<ParameterConstraint> constraints;
    sequence<Prior> priors;

    /** used if descriptionType == MIXTURE */
    DiscreteCharacterModelSeq subModels;
    sequence<ParamID> mixtureProportions;
    Prior mixtureProportionPrior;

    /* NOTE(review): no descriptionType condition is attached to this field --
       confirm when it is used. */
    ExprID rateMultiplier;
};
};