001/** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018 019package org.apache.oozie.coord; 020 021import java.net.URI; 022import java.net.URISyntaxException; 023import java.util.regex.Pattern; 024 025import org.apache.hadoop.conf.Configuration; 026import org.apache.oozie.DagELFunctions; 027import org.apache.oozie.client.WorkflowJob; 028import org.apache.oozie.dependency.URIHandler; 029import org.apache.oozie.service.ConfigurationService; 030import org.apache.oozie.service.Services; 031import org.apache.oozie.service.URIHandlerService; 032import org.apache.oozie.util.ELEvaluator; 033import org.apache.oozie.util.HCatURI; 034import org.apache.oozie.util.HCatURIParser; 035import org.apache.oozie.util.XLog; 036 037/** 038 * This class implements the EL function for HCat datasets in coordinator 039 */ 040 041public class HCatELFunctions { 042 private static final Configuration EMPTY_CONF = new Configuration(true); 043 private static final String HCAT_URI_REGEX_CONFIG = ConfigurationService.get("oozie.hcat.uri.regex.pattern"); 044 private static final Pattern HCAT_URI_PATTERN = Pattern.compile(HCAT_URI_REGEX_CONFIG); 045 046 enum EventType { 047 input, output 048 } 049 050 /* Workflow Parameterization EL functions */ 051 052 /** 053 * Return true if partitions exists or false if not. 054 * 055 * @param uri hcatalog partition uri. 056 * @return <code>true</code> if the uri exists, <code>false</code> if it does not. 057 * @throws Exception 058 */ 059 public static boolean hcat_exists(String uri) throws Exception { 060 URI hcatURI = new URI(uri); 061 URIHandlerService uriService = Services.get().get(URIHandlerService.class); 062 URIHandler handler = uriService.getURIHandler(hcatURI); 063 WorkflowJob workflow = DagELFunctions.getWorkflow(); 064 String user = workflow.getUser(); 065 return handler.exists(hcatURI, EMPTY_CONF, user); 066 } 067 068 /* Coord EL functions */ 069 070 /** 071 * Echo the same EL function without evaluating anything 072 * 073 * @param dataInName 074 * @return the same EL function 075 */ 076 public static String ph1_coord_databaseIn_echo(String dataInName) { 077 // Checking if the dataIn is correct? 078 isValidDataEvent(dataInName); 079 return echoUnResolved("databaseIn", "'" + dataInName + "'"); 080 } 081 082 public static String ph1_coord_databaseOut_echo(String dataName) { 083 // Checking if the dataOut is correct? 084 isValidDataEvent(dataName); 085 return echoUnResolved("databaseOut", "'" + dataName + "'"); 086 } 087 088 public static String ph1_coord_tableIn_echo(String dataName) { 089 // Checking if the dataIn is correct? 090 isValidDataEvent(dataName); 091 return echoUnResolved("tableIn", "'" + dataName + "'"); 092 } 093 094 public static String ph1_coord_tableOut_echo(String dataName) { 095 // Checking if the dataOut is correct? 096 isValidDataEvent(dataName); 097 return echoUnResolved("tableOut", "'" + dataName + "'"); 098 } 099 100 public static String ph1_coord_dataInPartitionFilter_echo(String dataInName, String type) { 101 // Checking if the dataIn/dataOut is correct? 102 isValidDataEvent(dataInName); 103 return echoUnResolved("dataInPartitionFilter", "'" + dataInName + "', '" + type + "'"); 104 } 105 106 public static String ph1_coord_dataInPartitionMin_echo(String dataInName, String partition) { 107 // Checking if the dataIn/dataOut is correct? 108 isValidDataEvent(dataInName); 109 return echoUnResolved("dataInPartitionMin", "'" + dataInName + "', '" + partition + "'"); 110 } 111 112 public static String ph1_coord_dataInPartitionMax_echo(String dataInName, String partition) { 113 // Checking if the dataIn/dataOut is correct? 114 isValidDataEvent(dataInName); 115 return echoUnResolved("dataInPartitionMax", "'" + dataInName + "', '" + partition + "'"); 116 } 117 118 public static String ph1_coord_dataOutPartitions_echo(String dataOutName) { 119 // Checking if the dataIn/dataOut is correct? 120 isValidDataEvent(dataOutName); 121 return echoUnResolved("dataOutPartitions", "'" + dataOutName + "'"); 122 } 123 124 public static String ph1_coord_dataInPartitions_echo(String dataInName, String type) { 125 // Checking if the dataIn/dataOut is correct? 126 isValidDataEvent(dataInName); 127 return echoUnResolved("dataInPartitions", "'" + dataInName + "', '" + type + "'"); 128 } 129 130 public static String ph1_coord_dataOutPartitionValue_echo(String dataOutName, String partition) { 131 // Checking if the dataIn/dataOut is correct? 132 isValidDataEvent(dataOutName); 133 return echoUnResolved("dataOutPartitionValue", "'" + dataOutName + "', '" + partition + "'"); 134 } 135 136 /** 137 * Extract the hcat DB name from the URI-template associate with 138 * 'dataInName'. Caller needs to specify the EL-evaluator level variable 139 * 'oozie.coord.el.dataset.bean' with synchronous dataset object 140 * (SyncCoordDataset) 141 * 142 * @param dataInName 143 * @return DB name 144 */ 145 public static String ph3_coord_databaseIn(String dataInName) { 146 HCatURI hcatURI = getURIFromResolved(dataInName, EventType.input); 147 if (hcatURI != null) { 148 return hcatURI.getDb(); 149 } 150 else { 151 return ""; 152 } 153 } 154 155 /** 156 * Extract the hcat DB name from the URI-template associate with 157 * 'dataOutName'. Caller needs to specify the EL-evaluator level variable 158 * 'oozie.coord.el.dataset.bean' with synchronous dataset object 159 * (SyncCoordDataset) 160 * 161 * @param dataOutName 162 * @return DB name 163 */ 164 public static String ph3_coord_databaseOut(String dataOutName) { 165 HCatURI hcatURI = getURIFromResolved(dataOutName, EventType.output); 166 if (hcatURI != null) { 167 return hcatURI.getDb(); 168 } 169 else { 170 return ""; 171 } 172 } 173 174 /** 175 * Extract the hcat Table name from the URI-template associate with 176 * 'dataInName'. Caller needs to specify the EL-evaluator level variable 177 * 'oozie.coord.el.dataset.bean' with synchronous dataset object 178 * (SyncCoordDataset) 179 * 180 * @param dataInName 181 * @return Table name 182 */ 183 public static String ph3_coord_tableIn(String dataInName) { 184 HCatURI hcatURI = getURIFromResolved(dataInName, EventType.input); 185 if (hcatURI != null) { 186 return hcatURI.getTable(); 187 } 188 else { 189 return ""; 190 } 191 } 192 193 /** 194 * Extract the hcat Table name from the URI-template associate with 195 * 'dataOutName'. Caller needs to specify the EL-evaluator level variable 196 * 'oozie.coord.el.dataset.bean' with synchronous dataset object 197 * (SyncCoordDataset) 198 * 199 * @param dataOutName 200 * @return Table name 201 */ 202 public static String ph3_coord_tableOut(String dataOutName) { 203 HCatURI hcatURI = getURIFromResolved(dataOutName, EventType.output); 204 if (hcatURI != null) { 205 return hcatURI.getTable(); 206 } 207 else { 208 return ""; 209 } 210 } 211 212 /** 213 * Used to specify the HCat partition filter which is input dependency for workflow job.<p> Look for two evaluator-level 214 * variables <p> A) .datain.<DATAIN_NAME> B) .datain.<DATAIN_NAME>.unresolved <p> A defines the current list of 215 * HCat URIs. <p> B defines whether there are any unresolved EL-function (i.e latest) <p> If there are something 216 * unresolved, this function will echo back the original function <p> otherwise it sends the partition filter. 217 * 218 * @param dataInName : Datain name 219 * @param type : for action type - pig, MR or hive 220 */ 221 public static String ph3_coord_dataInPartitionFilter(String dataInName, String type) { 222 ELEvaluator eval = ELEvaluator.getCurrent(); 223 String uris = (String) eval.getVariable(".datain." + dataInName); 224 Boolean unresolved = (Boolean) eval.getVariable(".datain." + dataInName + ".unresolved"); 225 if (unresolved != null && unresolved.booleanValue() == true) { 226 return "${coord:dataInPartitionFilter('" + dataInName + "', '" + type + "')}"; 227 } 228 return createPartitionFilter(uris, type); 229 } 230 231 /** 232 * Used to specify the HCat partition's value defining output for workflow job.<p> Look for two evaluator-level 233 * variables <p> A) .dataout.<DATAOUT_NAME> B) .dataout.<DATAOUT_NAME>.unresolved <p> A defines the current list of 234 * HCat URIs. <p> B defines whether there are any unresolved EL-function (i.e latest) <p> If there are something 235 * unresolved, this function will echo back the original function <p> otherwise it sends the partition value. 236 * 237 * @param dataOutName : Dataout name 238 * @param partitionName : Specific partition name whose value is wanted 239 */ 240 public static String ph3_coord_dataOutPartitionValue(String dataOutName, String partitionName) { 241 ELEvaluator eval = ELEvaluator.getCurrent(); 242 String uri = (String) eval.getVariable(".dataout." + dataOutName); 243 Boolean unresolved = (Boolean) eval.getVariable(".dataout." + dataOutName + ".unresolved"); 244 if (unresolved != null && unresolved.booleanValue() == true) { 245 return "${coord:dataOutPartitionValue('" + dataOutName + "', '" + partitionName + "')}"; 246 } 247 try { 248 HCatURI hcatUri = new HCatURI(uri); 249 return hcatUri.getPartitionValue(partitionName); 250 } 251 catch(URISyntaxException urie) { 252 XLog.getLog(HCatELFunctions.class).warn("Exception with uriTemplate [{0}]. Reason [{1}]: ", uri, urie); 253 throw new RuntimeException("HCat URI can't be parsed " + urie); 254 } 255 } 256 257 /** 258 * Used to specify the entire HCat partition defining output for workflow job.<p> Look for two evaluator-level 259 * variables <p> A) .dataout.<DATAOUT_NAME> B) .dataout.<DATAOUT_NAME>.unresolved <p> A defines the data-out 260 * HCat URI. <p> B defines whether there are any unresolved EL-function (i.e latest) <p> If there are something 261 * unresolved, this function will echo back the original function <p> otherwise it sends the partition. 262 * 263 * @param dataOutName : DataOut name 264 */ 265 public static String ph3_coord_dataOutPartitions(String dataOutName) { 266 ELEvaluator eval = ELEvaluator.getCurrent(); 267 String uri = (String) eval.getVariable(".dataout." + dataOutName); 268 Boolean unresolved = (Boolean) eval.getVariable(".dataout." + dataOutName + ".unresolved"); 269 if (unresolved != null && unresolved.booleanValue() == true) { 270 return "${coord:dataOutPartitions('" + dataOutName + "')}"; 271 } 272 try { 273 return new HCatURI(uri).toPartitionString(); 274 } 275 catch (URISyntaxException e) { 276 throw new RuntimeException("Parsing exception for HCatURI " + uri + ". details: " + e); 277 } 278 } 279 280 /** 281 * Used to specify the entire HCat partition defining input for workflow job. <p> Look for two evaluator-level 282 * variables <p> A) .datain.<DATAIN_NAME> B) .datain.<DATAIN_NAME>.unresolved <p> A defines the data-in HCat URI. 283 * <p> B defines whether there are any unresolved EL-function (i.e latest) <p> If there are something unresolved, 284 * this function will echo back the original function <p> otherwise it sends the partition. 285 * 286 * @param dataInName : DataIn name 287 * @param type : for action type: hive-export 288 */ 289 public static String ph3_coord_dataInPartitions(String dataInName, String type) { 290 ELEvaluator eval = ELEvaluator.getCurrent(); 291 String uri = (String) eval.getVariable(".datain." + dataInName); 292 Boolean unresolved = (Boolean) eval.getVariable(".datain." + dataInName + ".unresolved"); 293 if (unresolved != null && unresolved.booleanValue() == true) { 294 return "${coord:dataInPartitions('" + dataInName + "', '" + type + "')}"; 295 } 296 String partitionValue = null; 297 if (uri != null) { 298 if (type.equals("hive-export")) { 299 String[] uriList = HCatURIParser.splitHCatUris(uri, HCAT_URI_PATTERN); 300 if (uriList.length > 1) { 301 throw new RuntimeException("Multiple partitions not supported for hive-export type. Dataset name: " 302 + dataInName + " URI: " + uri); 303 } 304 try { 305 partitionValue = new HCatURI(uri).toPartitionValueString(type); 306 } 307 catch (URISyntaxException e) { 308 throw new RuntimeException("Parsing exception for HCatURI " + uri, e); 309 } 310 } else { 311 throw new RuntimeException("Unsupported type: " + type + " dataset name: " + dataInName); 312 } 313 } 314 else { 315 XLog.getLog(HCatELFunctions.class).warn("URI is null"); 316 return null; 317 } 318 return partitionValue; 319 } 320 321 /** 322 * Used to specify the MAXIMUM value of an HCat partition which is input dependency for workflow job. 323 * <p> Look for two evaluator-level 324 * variables <p> A) .datain.<DATAIN_NAME> B) .datain.<DATAIN_NAME>.unresolved <p> A defines the current list of 325 * HCat URIs. <p> B defines whether there are any unresolved EL-function (i.e latest) <p> If there are something 326 * unresolved, this function will echo back the original function <p> otherwise it sends the max partition value. 327 * 328 * @param dataInName : Datain name 329 * @param partitionName : Specific partition name whose MAX value is wanted 330 */ 331 public static String ph3_coord_dataInPartitionMin(String dataInName, String partitionName) { 332 ELEvaluator eval = ELEvaluator.getCurrent(); 333 String uris = (String) eval.getVariable(".datain." + dataInName); 334 Boolean unresolved = (Boolean) eval.getVariable(".datain." + dataInName + ".unresolved"); 335 if (unresolved != null && unresolved.booleanValue() == true) { 336 return "${coord:dataInPartitionMin('" + dataInName + "', '" + partitionName + "')}"; 337 } 338 String minPartition = null; 339 if (uris != null) { 340 String[] uriList = HCatURIParser.splitHCatUris(uris, HCAT_URI_PATTERN); 341 // get the partition values list and find minimum 342 try { 343 // initialize minValue with first partition value 344 minPartition = new HCatURI(uriList[0]).getPartitionValue(partitionName); 345 if (minPartition == null || minPartition.isEmpty()) { 346 throw new RuntimeException("No value in data-in uri for partition key: " + partitionName); 347 } 348 for (int i = 1; i < uriList.length; i++) { 349 String value = new HCatURI(uriList[i]).getPartitionValue(partitionName); 350 if(value.compareTo(minPartition) < 0) { //sticking to string comparison since some numerical date 351 //values can also contain letters e.g. 20120101T0300Z (UTC) 352 minPartition = value; 353 } 354 } 355 } 356 catch(URISyntaxException urie) { 357 throw new RuntimeException("HCat URI can't be parsed " + urie); 358 } 359 } 360 else { 361 XLog.getLog(HCatELFunctions.class).warn("URI is null"); 362 return null; 363 } 364 return minPartition; 365 } 366 367 /** 368 * Used to specify the MINIMUM value of an HCat partition which is input dependency for workflow job. 369 * <p> Look for two evaluator-level 370 * variables <p> A) .datain.<DATAIN_NAME> B) .datain.<DATAIN_NAME>.unresolved <p> A defines the current list of 371 * HCat URIs. <p> B defines whether there are any unresolved EL-function (i.e latest) <p> If there are something 372 * unresolved, this function will echo back the original function <p> otherwise it sends the min partition value. 373 * 374 * @param dataInName : Datain name 375 * @param partitionName : Specific partition name whose MIN value is wanted 376 */ 377 public static String ph3_coord_dataInPartitionMax(String dataInName, String partitionName) { 378 ELEvaluator eval = ELEvaluator.getCurrent(); 379 String uris = (String) eval.getVariable(".datain." + dataInName); 380 Boolean unresolved = (Boolean) eval.getVariable(".datain." + dataInName + ".unresolved"); 381 if (unresolved != null && unresolved.booleanValue() == true) { 382 return "${coord:dataInPartitionMin('" + dataInName + "', '" + partitionName + "')}"; 383 } 384 String maxPartition = null; 385 if (uris != null) { 386 String[] uriList = HCatURIParser.splitHCatUris(uris, HCAT_URI_PATTERN); 387 // get the partition values list and find minimum 388 try { 389 // initialize minValue with first partition value 390 maxPartition = new HCatURI(uriList[0]).getPartitionValue(partitionName); 391 if (maxPartition == null || maxPartition.isEmpty()) { 392 throw new RuntimeException("No value in data-in uri for partition key: " + partitionName); 393 } 394 for(int i = 1; i < uriList.length; i++) { 395 String value = new HCatURI(uriList[i]).getPartitionValue(partitionName); 396 if(value.compareTo(maxPartition) > 0) { 397 maxPartition = value; 398 } 399 } 400 } 401 catch(URISyntaxException urie) { 402 throw new RuntimeException("HCat URI can't be parsed " + urie); 403 } 404 } 405 else { 406 XLog.getLog(HCatELFunctions.class).warn("URI is null"); 407 return null; 408 } 409 return maxPartition; 410 } 411 412 private static String createPartitionFilter(String uris, String type) { 413 String[] uriList = HCatURIParser.splitHCatUris(uris, HCAT_URI_PATTERN); 414 StringBuilder filter = new StringBuilder(""); 415 if (uriList.length > 0) { 416 for (String uri : uriList) { 417 if (filter.length() > 0) { 418 filter.append(" OR "); 419 } 420 try { 421 filter.append(new HCatURI(uri).toPartitionFilter(type)); 422 } 423 catch (URISyntaxException e) { 424 throw new RuntimeException("Parsing exception for HCatURI " + uri + ". details: " + e); 425 } 426 } 427 } 428 return filter.toString(); 429 } 430 431 private static HCatURI getURIFromResolved(String dataInName, EventType type) { 432 final XLog LOG = XLog.getLog(HCatELFunctions.class); 433 StringBuilder uriTemplate = new StringBuilder(); 434 ELEvaluator eval = ELEvaluator.getCurrent(); 435 String uris; 436 if(type == EventType.input) { 437 uris = (String) eval.getVariable(".datain." + dataInName); 438 } 439 else { //type=output 440 uris = (String) eval.getVariable(".dataout." + dataInName); 441 } 442 if (uris != null) { 443 String[] uri = HCatURIParser.splitHCatUris(uris, HCAT_URI_PATTERN); 444 uriTemplate.append(uri[0]); 445 } 446 else { 447 LOG.warn("URI is NULL"); 448 return null; 449 } 450 LOG.info("uriTemplate [{0}] ", uriTemplate); 451 HCatURI hcatURI; 452 try { 453 hcatURI = new HCatURI(uriTemplate.toString()); 454 } 455 catch (URISyntaxException e) { 456 LOG.info("uriTemplate [{0}]. Reason [{1}]: ", uriTemplate, e); 457 throw new RuntimeException("HCat URI can't be parsed " + e); 458 } 459 return hcatURI; 460 } 461 462 private static boolean isValidDataEvent(String dataInName) { 463 ELEvaluator eval = ELEvaluator.getCurrent(); 464 String val = (String) eval.getVariable("oozie.dataname." + dataInName); 465 if (val == null || (val.equals("data-in") == false && val.equals("data-out") == false)) { 466 XLog.getLog(HCatELFunctions.class).error("dataset name " + dataInName + " is not valid. val :" + val); 467 throw new RuntimeException("data set name " + dataInName + " is not valid"); 468 } 469 return true; 470 } 471 472 private static String echoUnResolved(String functionName, String n) { 473 return echoUnResolvedPre(functionName, n, "coord:"); 474 } 475 476 private static String echoUnResolvedPre(String functionName, String n, String prefix) { 477 ELEvaluator eval = ELEvaluator.getCurrent(); 478 eval.setVariable(".wrap", "true"); 479 return prefix + functionName + "(" + n + ")"; // Unresolved 480 } 481 482}