# pmmltreemodel2R v0.01.03: convert a PMML tree model to R code # Copyright (C) 2010 Shane Butler # # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA # # ABOUT # ===== # This program will load a PMML tree model and convert it into the R language. # It is not indended to support all PMML features (transforms, for example) # however it is hoped it will serve as a base for scoring with other languages, # and should be pretty easy to do so by changing the programming constructs, etc # that are generated. If you do improve this code to support your favourite # language, please consider supplying your changes back to help us improve the # project! # # It is stongly recommended that if you use code generated by this script that # you validate the results it generates against a file scored elsewhere. # # USAGE # ===== # This is a very early alpha release and as such is a little rough around the # edges! It should, however be fairly straight forward to use: # > source("pmmltreemodel2R.R") # > pmmltreemodel2R("weather_rpart.xml") # # Following this you will have the following global variables in your # environment (Sorry - R newbie and have not worked out best way to return # these!!): # - pmml: an XML object representing your PMML file # - src: your newly created R code (type cat(src) to see the formatted code) # - predname: the target variable name which would have the prediction set # after you run the code in src # # TODO # ==== # - detect class type and format accordingly (don't force as string) # - output according to relevant OutputFields, dont use from MiningSchema # - seperate parsing from language generation to allow easy extension to other # languages # - tests! create wrapper functions to show how to use. # - prepend a commented list of model vars to the output # # KNOWN ISSUES # ============ # This early alpha has not been widely tested and is not yet expected to run # across files from all PMML providers. Many features are still missing. # # CHANGELOG # ========= # 04 Feb 2010: v0.01.03 # - surrogate support (predicateHandler(), predicateChildren()) # - output class assignment using Node objects "score" attribute (treeNodeHandler()) # - output as many ScoreDistributions as are specified (treeNodeHandler()) # 02 Feb 2010: v0.01.02 # - Significant changes to treeNodeHandler() - fixes major bug in code gen # 01 Feb 2010: v0.01.01 # - Use %in% operator for isIn and isNotIn predicates # - Major surgery to predicateHandler(), most compound predicates work now # - Minor improvements in error reporting # - Consistent probability variable names as output_prob # - Fixed typos # 30th Jan 2010: v0.01.00 # - First release! miningSchemaHandler <- function(xmlObj) { ret <- vector() for (k in (xmlObj[names(xmlObj) == "MiningField"])) { if (xmlGetAttr(k, "usageType") == "predicted") { predname <<- xmlGetAttr(k, "name") } ret[length(ret)+1] <- xmlGetAttr(k, "usageType") } # TODO do something with ret, "supplementary" and "active" } lookupSimplePredicate <- function (field, strOp, value) { if (strOp == "equal") { return (paste(field, "==", value)) } else if (strOp == "notEqual") { return (paste(field, "!=", value)) } else if (strOp == "lessThan") { return (paste(field, "<", value)) } else if (strOp == "lessOrEqual") { return (paste(field, "<=", value)) } else if (strOp == "greaterThan") { return (paste(field, ">", value)) } else if (strOp == "greaterOrEqual") { return (paste(field, ">=", value)) } else if (strOp == "isMissing") { return (paste("is.na(", field, ")", sep="")) } else if (strOp == "isNotMissing") { return (paste("!is.na(", field, ")", sep="")) } else { stop ("Unknown simple predicate: ", strOp) } } lookupSimpleSetPredicate <- function (field, strOp, a) { # Convert the array to a comma delimited string if (xmlGetAttr(a, "type") == "string") { flat1 <- unlist(strsplit(xmlValue(a), "\"")) is_string <- TRUE } else { flat1 <- unlist(strsplit(xmlValue(a), " ")) is_string <- FALSE } flat2 <- unlist(lapply(flat1, function(x){ if(!is.null(x) & x !="" & x!=" ") return(x); })) if (is_string) { value <- paste("\"", paste (flat2, collapse="\", \""), "\"", sep="") } else { value <- paste (flat2, collapse=", ") } if (strOp == "isIn") { return (paste("any(", field, " %in% c(", value, "))", sep="")) } else if (strOp == "isNotIn") { return (paste("!any(", field, " %in% c(", value, "))", sep="")) } else { stop ("Unkown simple set predicate: ", strOp) } } predicateChildren <- function (comp_pred, surr_flag) { child_preds <- vector() countr <- 0 for (k in xmlChildren (comp_pred)) { l <- list (tmp_name=k) names (l) <- xmlName (k) countr <- countr + 1 child_preds[countr] <- predicateHandler (l, surr_flag) } return (child_preds) } predicateHandler <- function(kids, surr_flag) { if (!is.null(kids$True)) { return ("TRUE") } else if (!is.null(kids$False)) { return ("FALSE") } else if (!is.null(kids$SimplePredicate)) { field <- xmlGetAttr(kids$SimplePredicate, "field") operator <- xmlGetAttr(kids$SimplePredicate, "operator") val <- xmlGetAttr(kids$SimplePredicate, "value") if (surr_flag && operator != "isMissing" && operator != "isNotMissing") { return (paste("(!is.na(", field, ") && ", lookupSimplePredicate (field, operator, val), ")", sep="")) } else { return (lookupSimplePredicate (field, operator, val)) } } else if (!is.null(kids$SimpleSetPredicate)) { field <- xmlGetAttr(kids$SimpleSetPredicate, "field") operator <- xmlGetAttr(kids$SimpleSetPredicate, "booleanOperator") if (surr_flag) { return (paste("(!is.na(", field, ") && ", lookupSimpleSetPredicate(field, operator, xmlChildren(kids$SimpleSetPredicate)$Array), ")", sep="")) } else { return (lookupSimpleSetPredicate(field, operator, xmlChildren(kids$SimpleSetPredicate)$Array)) } } else if (!is.null(kids$CompoundPredicate)) { operator <- xmlGetAttr(kids$CompoundPredicate, "booleanOperator") if (operator == "or") { child_preds <- predicateChildren(kids$CompoundPredicate, surr_flag) return (paste("(", paste(child_preds, collapse=" || "), ")", sep="")) } else if (operator == "and") { child_preds <- predicateChildren(kids$CompoundPredicate, surr_flag) return (paste("(", paste(child_preds, collapse=" && "), ")", sep="")) } else if (operator == "xor") { child_preds <- predicateChildren(kids$CompoundPredicate, surr_flag) # R's xor() seems to take only 2 args so I have used modulus to check for even number of true booleans return (paste("((sum(c(", paste(child_preds, collapse=", "), ")) %% 2) == 1)", sep="")) } else if (operator == "surrogate") { child_preds <- predicateChildren(kids$CompoundPredicate, TRUE) return (paste("(", paste(child_preds, collapse=" || "), ")", sep="")) } else { warning ("Unknown CompoundPredicate type: ", operator) } } else { stop ("Unknown predicate type") } } treeNodeHandler <- function(xmlObj, depth) { pad_sz <- " " indent <- paste(rep(pad_sz,depth), collapse="") kids <- xmlChildren(xmlObj) scores_str <- "" nodeRecordCount <- xmlGetAttr(xmlObj, "recordCount") if (any(names(kids) == "ScoreDistribution")) { for (k in (kids[names(kids) == "ScoreDistribution"])) { scores_str <- paste(scores_str, indent, pad_sz, predname, "_", xmlGetAttr(k, "value"), " <- ", xmlGetAttr(k, "recordCount"), " / ", nodeRecordCount, "\n", sep="") } } klass <- xmlGetAttr (xmlObj, "score") if (!is.null(klass)) { klass_str <- paste(indent, pad_sz, predname, " <- \"", klass, "\"\n", sep="") #FIXME dont assume string } else { klass_str <- "" } # if there is a child node, then recurse if (any(names(kids) == "Node")) { tmp_nodes <- kids[names(kids) == "Node"] return (paste ("(", predicateHandler(kids, FALSE), ") {\n", klass_str, scores_str, indent, pad_sz, "if ", paste(lapply(tmp_nodes, treeNodeHandler, depth=depth+1), collapse=" else if "), "\n", indent, "}", sep="")) } else { return (paste ("(", predicateHandler(kids, FALSE), ") {\n", klass_str, scores_str, indent, "}", sep="")) } } treeModelHandler <- function(model) { miningschema <- xmlChildren(model)$MiningSchema miningSchemaHandler(miningschema) #output <- xmlChildren(model)$Output #outputHandler(output) root_node <- xmlChildren(model)$Node child_node <- xmlChildren(root_node) return (paste("if", treeNodeHandler(xmlChildren(model)$Node, 0), "\n")) } pmmltreemodel2R <- function(filename) { require (XML, quietly=TRUE) xmlobj <- xmlTreeParse (filename, useInternalNodes=TRUE) pmml <<- xmlChildren(xmlobj)$PMML # TODO handle other parts of the PMML file #if (any(names(pmml) == "DataDictionary")) { # dataDictHandler(xmlChildren(pmml)$DataDictionary) #} ver_pmml2code <- "0.01.03" #ver_treemodel <- "02" hdr <- paste ("# pmmltreemodel2R ", ver_pmml2code, "\n", "# WARNING: THIS IS ALPHA SOFTWARE, USE AT YOUR OWN RISK\n", "#\n", "# Input: PMML TreeModel\n", "# Output Language: R\n", "#\n", sep="") if (any(names(pmml) == "TreeModel")) { src <<- paste (hdr, paste (treeModelHandler(xmlChildren(pmml)$TreeModel)), sep="") } else { stop ("PMML model type not yet implemented!") } }