define() operates on data using one or more formula-based definitions. Data transformation and selection can be expressed with formulas or with standard data.table syntax, applied procedurally within a single function call. Each operation acts on the result of the previous one, making define() a compact pipeline of operations.

define(data = NULL, ..., keep.rownames = TRUE, blueprint = NULL)

Arguments

data

The input data (see 'Details')

...

(dots_list): Formulas for which the left-hand side (LHS) is an expression containing the operation, and the right-hand side (RHS) contains column names that form a grouping set for the operation (i.e., <expression> ~ col_1 + col_2 + ...):

  • If the form <LHS> ~ . is given, the LHS executes using all columns as the grouping set

  • If the form <LHS> ~ 1 is given, the LHS executes without grouping

  • If no LHS is given, the operation defaults to selection based on the RHS

keep.rownames

See data.table

blueprint

(EXPERIMENTAL) See blueprint.

Value

The modified data.

Details

  • If data is a smart.data object, the taxonomical references to field names can be accessed by using the use() syntax in the right-hand side of the formula (e.g., ~use(term1, term2) + otherTerm1 + ...).

  • When using formulas for ... on an empty data.table object, set the argument data to NULL; otherwise, you will get an error.

Examples

# smart.data example: everything in this block runs only when the smart.data
# package can be loaded (require() returns FALSE instead of erroring).
if (require(smart.data)){
  library(smart.data)
  smart.start();

  # Two taxonomy terms: "identifier" maps to the field 'rn', and "category"
  # maps to the fields 'cyl', 'gear' and 'carb'.
  taxonomy_list <- list(
    identifier = taxonomy(
      term = "identifier"
      , desc = "Identifier"
      , fields = c("rn"))
    , category = taxonomy(
      term = "category"
      , desc = "Category"
      , fields = c("cyl", "gear", "carb")
      )
    );

  # Wrap the first ten rows of mtcars (row names kept as a column) in a
  # smart.data object and register the taxonomy terms defined above.
  # NOTE(review): the effects of enforce.rules(for_usage) and
  # cache_mgr(action = upd) are not visible here -- see the smart.data docs.
  smart_mt <- smart.data$
    new(as.data.table(mtcars[1:10, ], keep.rownames = TRUE))$
    taxonomy.rule(!!!taxonomy_list)$
    enforce.rules(for_usage)$
    cache_mgr(action = upd);

  # Two chained formulas: the first builds 'j' and 'mpg' with vs, am and the
  # use()-referenced fields on the RHS; the second has no LHS, so it selects
  # only 'j' and 'mpg' from the previous result (see the first printed table).
  print(test_obj <- define(
    smart_mt
    , list(j = 1, mpg) ~vs + am + use(identifier, category)
    , ~j + mpg
    , keep.rownames = FALSE
    ))

  # Selection only (no LHS): vs, am, plus the fields behind the two taxonomy
  # terms. This overwrites test_obj.
  print(test_obj <- define(smart_mt, ~vs + am + use(identifier, category)));

  # Add columns 'x' and 'y' to the previous result with `:=`.
  # NOTE(review): the captured output shows x = 3 on every row, i.e. the
  # ungrouped total of am^2, so the use() terms do not appear to act as a
  # per-row grouping set here -- presumably because test_obj is a plain result
  # rather than a smart.data object; confirm.
  print(define(
    test_obj
    , `:=`(x = sum(am^2), y = 10) ~ use(identifier, category)
    ));

  # Print test_obj again: the captured output shows it without 'x' and 'y'.
  print(test_obj)

  # Drop the temporary result; smart_mt and taxonomy_list are used further below.
  rm(test_obj)
}
#> Loading required package: smart.data
#> Loading required package: magrittr
#> 'new_data' is not found in the smart-cache
#>         j   mpg
#>     <num> <num>
#>  1:     1  21.0
#>  2:     1  21.0
#>  3:     1  22.8
#>  4:     1  21.4
#>  5:     1  18.7
#>  6:     1  18.1
#>  7:     1  14.3
#>  8:     1  24.4
#>  9:     1  22.8
#> 10:     1  19.2
#>        vs    am                rn   cyl  gear  carb
#>     <num> <num>            <char> <num> <num> <num>
#>  1:     0     1         Mazda RX4     6     4     4
#>  2:     0     1     Mazda RX4 Wag     6     4     4
#>  3:     1     1        Datsun 710     4     4     1
#>  4:     1     0    Hornet 4 Drive     6     3     1
#>  5:     0     0 Hornet Sportabout     8     3     2
#>  6:     1     0           Valiant     6     3     1
#>  7:     0     0        Duster 360     8     3     4
#>  8:     1     0         Merc 240D     4     4     2
#>  9:     1     0          Merc 230     4     4     2
#> 10:     1     0          Merc 280     6     4     4
#>        vs    am                rn   cyl  gear  carb     x     y
#>     <num> <num>            <char> <num> <num> <num> <num> <num>
#>  1:     0     1         Mazda RX4     6     4     4     3    10
#>  2:     0     1     Mazda RX4 Wag     6     4     4     3    10
#>  3:     1     1        Datsun 710     4     4     1     3    10
#>  4:     1     0    Hornet 4 Drive     6     3     1     3    10
#>  5:     0     0 Hornet Sportabout     8     3     2     3    10
#>  6:     1     0           Valiant     6     3     1     3    10
#>  7:     0     0        Duster 360     8     3     4     3    10
#>  8:     1     0         Merc 240D     4     4     2     3    10
#>  9:     1     0          Merc 230     4     4     2     3    10
#> 10:     1     0          Merc 280     6     4     4     3    10
#>        vs    am                rn   cyl  gear  carb
#>     <num> <num>            <char> <num> <num> <num>
#>  1:     0     1         Mazda RX4     6     4     4
#>  2:     0     1     Mazda RX4 Wag     6     4     4
#>  3:     1     1        Datsun 710     4     4     1
#>  4:     1     0    Hornet 4 Drive     6     3     1
#>  5:     0     0 Hornet Sportabout     8     3     2
#>  6:     1     0           Valiant     6     3     1
#>  7:     0     0        Duster 360     8     3     4
#>  8:     1     0         Merc 240D     4     4     2
#>  9:     1     0          Merc 230     4     4     2
#> 10:     1     0          Merc 280     6     4     4

# Called with no arguments: the captured output is a null data.table.
define()[];
#> Null data.table (0 rows and 0 cols)
# Named non-formula arguments define columns; 'y' is computed from the 'x'
# defined just before it.
define(x = 1:10, y = x * 3)[];
#>         x     y
#>     <int> <num>
#>  1:     1     3
#>  2:     2     6
#>  3:     3     9
#>  4:     4    12
#>  5:     5    15
#>  6:     6    18
#>  7:     7    21
#>  8:     8    24
#>  9:     9    27
#> 10:    10    30
# Definitions are evaluated in order, so 'z' can use both 'x' and 'y'.
define(x = 1:10, y = x * 3, z = x*y)[];
#>         x     y     z
#>     <int> <num> <num>
#>  1:     1     3     3
#>  2:     2     6    12
#>  3:     3     9    27
#>  4:     4    12    48
#>  5:     5    15    75
#>  6:     6    18   108
#>  7:     7    21   147
#>  8:     8    24   192
#>  9:     9    27   243
#> 10:    10    30   300
# 'data' is passed explicitly as NULL; the trailing formula has no LHS, so it
# selects 'x' and 'z' from the previous result ('y' is dropped).
define(NULL, x = 1:10, y = x * 3, z = x*y, ~x + z)[];
#>         x     z
#>     <int> <num>
#>  1:     1     3
#>  2:     2    12
#>  3:     3    27
#>  4:     4    48
#>  5:     5    75
#>  6:     6   108
#>  7:     7   147
#>  8:     8   192
#>  9:     9   243
#> 10:    10   300
# Starting from an empty data.table: columns are defined first, then the
# formula list(z = 10) ~ x is applied; the captured output holds only 'x' and 'z'.
define(data.table(), x = 1:10, y = x * 3, list(z = 10) ~ x)[];
#>         x     z
#>     <int> <num>
#>  1:     1    10
#>  2:     2    10
#>  3:     3    10
#>  4:     4    10
#>  5:     5    10
#>  6:     6    10
#>  7:     7    10
#>  8:     8    10
#>  9:     9    10
#> 10:    10    10

# Predefined operations:
# NOTE: smart_mt is created inside the guarded smart.data example above, so
# these statements assume that example has run.
# First, build a result that defines 'x' (sum(am^2) over the use() fields);
# it is passed as the blueprint below.
predef_data <- define(smart_mt, x = sum(am^2) ~ use(identifier, category));

redef_data <- define(
  smart_mt
  # Normally, listing 'x' would throw an error as it would not exist in the data;
  # however, since a blueprint is provided from a previous definition, the 'x' variable is
  # created within scope before the additional operations execute:
  , list(j = 1, mpg, x) ~vs + am + use(identifier, category)
  , blueprint = predef_data
  );

# Auto-print the result: 'x' from the blueprint appears alongside 'j' and 'mpg'.
redef_data;
#>        vs    am                rn   cyl  gear  carb     j   mpg     x
#>     <num> <num>            <char> <num> <num> <num> <num> <num> <num>
#>  1:     0     1         Mazda RX4     6     4     4     1  21.0     1
#>  2:     0     1     Mazda RX4 Wag     6     4     4     1  21.0     1
#>  3:     1     1        Datsun 710     4     4     1     1  22.8     1
#>  4:     1     0    Hornet 4 Drive     6     3     1     1  21.4     0
#>  5:     0     0 Hornet Sportabout     8     3     2     1  18.7     0
#>  6:     1     0           Valiant     6     3     1     1  18.1     0
#>  7:     0     0        Duster 360     8     3     4     1  14.3     0
#>  8:     1     0         Merc 240D     4     4     2     1  24.4     0
#>  9:     1     0          Merc 230     4     4     2     1  22.8     0
#> 10:     1     0          Merc 280     6     4     4     1  19.2     0

# Inspect the "blueprint" attribute attached to the result: the captured schema
# lists both formulas, with use() expanded to rn + cyl + gear + carb.
attr(redef_data, "blueprint");
#> An object of class "blueprint"
#> Slot "schema":
#> $x
#> sum(am^2) ~ rn + cyl + gear + carb
#> <environment: 0x0000019ef2bdfd08>
#> 
#> [[2]]
#> list(j = 1, mpg, x) ~ vs + am + rn + cyl + gear + carb
#> <environment: 0x0000019ef3816460>
#> 
#> 

# Passing redef_data as the blueprint to a fresh define() on smart_mt yields a
# result identical to redef_data (captured output: TRUE).
identical(redef_data, define(smart_mt, blueprint = redef_data))
#> [1] TRUE

# Cleanup: detach and unload smart.data only if its namespace is currently loaded.
if ("smart.data" %in% loadedNamespaces()){
  detach("package:smart.data", unload = TRUE)
}

# Remove the objects created by the examples above.
rm(redef_data, smart_mt, predef_data, taxonomy_list)