Rolling Join

Create some data

library(data.table)

## Create some data ##

person <- c('Randy', 'Anne', 'Sawyer', 'Taylor',
            'Randy', 'Anne', 'Sawyer', 'Taylor',
            'Randy', 'Anne', 'Sawyer', 'Taylor',
            'Randy', 'Anne', 'Sawyer', 'Taylor')

sale_date <- c('2022-01-10', '2022-02-22', '2022-01-11', '2022-03-01',
               '2022-01-12', '2022-02-26', '2022-01-14', '2022-03-06',
               '2022-02-10', '2022-02-27', '2022-01-15', '2022-03-07')

sale_item <- c('item1', 'item2', 'item3', 'item4',
               'item2', 'item3', 'item1', 'item3',
               'item4', 'item2', 'item3', 'item1')

person <- c('Randy', 'Anne', 'Sawyer', 'Taylor',
            'Randy', 'Anne', 'Sawyer', 'Taylor',
            'Randy', 'Anne', 'Sawyer', 'Taylor')
ad_date <- c('2022-01-01', '2022-01-01', '2022-01-01', '2022-01-01',
             '2022-02-01', '2022-02-01', '2022-02-01', '2022-02-01',
             '2022-032-01', '2022-03-01', '2022-03-01', '2022-03-01')

sales_data <- data.frame(person, sale_date, sale_item)
ad_data <- data.frame(person, ad_date)


sales_data
##    person  sale_date sale_item
## 1   Randy 2022-01-10     item1
## 2    Anne 2022-02-22     item2
## 3  Sawyer 2022-01-11     item3
## 4  Taylor 2022-03-01     item4
## 5   Randy 2022-01-12     item2
## 6    Anne 2022-02-26     item3
## 7  Sawyer 2022-01-14     item1
## 8  Taylor 2022-03-06     item3
## 9   Randy 2022-02-10     item4
## 10   Anne 2022-02-27     item2
## 11 Sawyer 2022-01-15     item3
## 12 Taylor 2022-03-07     item1
ad_data
##    person     ad_date
## 1   Randy  2022-01-01
## 2    Anne  2022-01-01
## 3  Sawyer  2022-01-01
## 4  Taylor  2022-01-01
## 5   Randy  2022-02-01
## 6    Anne  2022-02-01
## 7  Sawyer  2022-02-01
## 8  Taylor  2022-02-01
## 9   Randy 2022-032-01
## 10   Anne  2022-03-01
## 11 Sawyer  2022-03-01
## 12 Taylor  2022-03-01

Perform rolling join using data.table. You will first need to set the key values for each dataset. The last variable in the dataset will be the variable that the join rolls across. Oftentimes, this is a date field. When joined, only one of the “date” columns will be kept, so I tend to duplicate the original fields in each table to ensure I don’t lose either.

library(data.table)

## Make sure both data.frames are structured as data.tables##

sales_data <- data.table(sales_data)
ad_data <- data.table(ad_data)

## Create duplicated date values to join on.  This will help ensure we don't lose the date from either data.table once joined.

## Make sure date fields are dates to ensure proper matching
sales_data$sale_date <- as.Date(sales_data$sale_date)
ad_data$ad_date <- as.Date(ad_data$ad_date)

sales_data$join_date <- as.Date(sales_data$sale_date)
ad_data$join_date <- as.Date(ad_data$ad_date)

## Set the "Key" columns for each data.table

keycols <- c("person", "join_date")

setkeyv(sales_data, keycols)
setkeyv(ad_data, keycols)

Now that the data is created and structured, the join is quite simple and quick.

## Use a rolling join to match each sale to preceding ad
## you can use rollends if you would like to set limits on the joining (i.e. limit how many days before/after to match) -- rollends=c(1,999) If you want to roll "backwards", you can use roll=-Inf (you can replace "Inf" with the number of days to look)

matched_sales <- sales_data[ad_data, roll=-Inf]

## Match the other direction too!

matched_ads <- ad_data[sales_data, roll=T]

## Once matched, it may be useful to add a column to show how many days it took to reach the match

matched_sales$days_to_purchase <- as.numeric(matched_sales$ad_date) - as.numeric(matched_sales$sale_date)

matched_ads$days_since_ad <- as.numeric(matched_ads$sale_date) - as.numeric(matched_ads$ad_date)

matched_sales
##     person  sale_date sale_item  join_date    ad_date days_to_purchase
##  1:   Anne 2022-02-22     item2 2022-01-01 2022-01-01              -52
##  2:   Anne 2022-02-22     item2 2022-02-01 2022-02-01              -21
##  3:   Anne       <NA>      <NA> 2022-03-01 2022-03-01               NA
##  4:  Randy 2022-01-10     item1       <NA>       <NA>               NA
##  5:  Randy 2022-01-10     item1 2022-01-01 2022-01-01               -9
##  6:  Randy 2022-02-10     item4 2022-02-01 2022-02-01               -9
##  7: Sawyer 2022-01-11     item3 2022-01-01 2022-01-01              -10
##  8: Sawyer       <NA>      <NA> 2022-02-01 2022-02-01               NA
##  9: Sawyer       <NA>      <NA> 2022-03-01 2022-03-01               NA
## 10: Taylor 2022-03-01     item4 2022-01-01 2022-01-01              -59
## 11: Taylor 2022-03-01     item4 2022-02-01 2022-02-01              -28
## 12: Taylor 2022-03-01     item4 2022-03-01 2022-03-01                0
matched_ads
##     person    ad_date  join_date  sale_date sale_item days_since_ad
##  1:   Anne 2022-02-01 2022-02-22 2022-02-22     item2            21
##  2:   Anne 2022-02-01 2022-02-26 2022-02-26     item3            25
##  3:   Anne 2022-02-01 2022-02-27 2022-02-27     item2            26
##  4:  Randy 2022-01-01 2022-01-10 2022-01-10     item1             9
##  5:  Randy 2022-01-01 2022-01-12 2022-01-12     item2            11
##  6:  Randy 2022-02-01 2022-02-10 2022-02-10     item4             9
##  7: Sawyer 2022-01-01 2022-01-11 2022-01-11     item3            10
##  8: Sawyer 2022-01-01 2022-01-14 2022-01-14     item1            13
##  9: Sawyer 2022-01-01 2022-01-15 2022-01-15     item3            14
## 10: Taylor 2022-03-01 2022-03-01 2022-03-01     item4             0
## 11: Taylor 2022-03-01 2022-03-06 2022-03-06     item3             5
## 12: Taylor 2022-03-01 2022-03-07 2022-03-07     item1             6