close
close
Effortless data processing: Find variables in multiple data files with R | by Rodrigo M Carrillo Larco, MD, PhD | November 2024

library(haven)
library(tidyverse)
library(stringr)

## STEPS TO USE THESE FEATURES:
## 1. DEFINE THE OBJECT “path_file”, WHICH IS THE PATH TO THE DIRECTORY WHERE
## ALL DATA FILES ARE SAVED.
## 2. APPLY THE get_names_labels FUNCTION TO THAT PATH. THE FUNCTION WILL
## RETURN A DATA FRAME NAMED “names_labels”.
## 3. THE FUNCTION WILL RETURN A DATASET (“names_labels”) SHOWING THE NAMES OF
## THE VARIABLES, THEIR LABELS AND THE DATA FILE THEY COME FROM. EXPLORE THIS
## DATASET VISUALLY/MANUALLY TO SELECT THE VARIABLES WE NEED. CREATE A VECTOR
## WITH THE NAMES OF THE VARIABLES WE NEED AND NAME THIS VECTOR
## “variables_needed”.
## 4. FROM THE DATASET “names_labels”, KEEP ONLY THE ROWS FOR THE VARIABLES WE
## NEED (SAVED IN THE VECTOR “variables_needed”).
## 5. APPLY THE read_and_select FUNCTION TO EACH DATA FILE THAT CONTAINS
## RELEVANT VARIABLES. THIS FUNCTION ONLY NEEDS THE DATA FILE NAME, WHICH IS
## STORED IN THE LAST COLUMN OF THE “names_labels” DATASET.

### FUNCTION TO 1) READ ALL DATA FILES IN A FOLDER; 2) EXTRACT VARIABLE NAMES
### AND LABELS; 3) COMBINE NAMES AND LABELS INTO ONE DATASET; AND 4) RETURN
### THAT DATASET. THE ONLY REQUIRED INPUT IS A PATH TO THE DIRECTORY WHERE ALL
### DATA FILES ARE STORED.

#' Collect variable names and labels from every SAS file in a directory.
#'
#' Reads each `.sas7bdat` file found directly inside `path_file`, extracts the
#' column names and their `label` attributes, and stacks the results into one
#' data frame.
#'
#' @param path_file Path to the directory that holds the `.sas7bdat` files.
#' @return Invisibly, a data frame with one row per variable per file and the
#'   columns `variable_name`, `variable_label` (`NA` when a variable carries
#'   no label) and `filename`. As a side effect the same data frame is
#'   assigned to `names_labels` in the global environment. A directory with
#'   no matching files yields a zero-row data frame with the same columns.
get_names_labels <- function(path_file) {
  sas_files <- list.files(path = path_file, pattern = "\\.sas7bdat$")

  # Preallocate one slot per file instead of growing the list in the loop.
  results <- vector("list", length(sas_files))

  # seq_along() keeps the loop from running when no files were found.
  for (i in seq_along(sas_files)) {
    print(sas_files[i])

    # file.path() works whether or not `path_file` ends with a separator.
    sas_data <- haven::read_sas(file.path(path_file, sas_files[i]))
    sas_data <- as.data.frame(sas_data)

    # Variable names, plus the `label` attribute of each column (NA if unset).
    var_names <- names(sas_data)
    var_labels <- vapply(
      sas_data,
      function(column) {
        label <- attr(column, "label", exact = TRUE)
        if (is.null(label)) NA_character_ else as.character(label)[1]
      },
      character(1)
    )

    # rep() keeps the column lengths consistent even for a zero-column file.
    results[[i]] <- data.frame(
      variable_name = var_names,
      variable_label = unname(var_labels),
      filename = rep(sas_files[i], length(var_names)),
      stringsAsFactors = FALSE
    )
  }

  if (length(results) > 0) {
    results_df <- do.call(rbind, results)
  } else {
    # Keep the return type stable when the directory has no SAS files.
    results_df <- data.frame(
      variable_name = character(0),
      variable_label = character(0),
      filename = character(0),
      stringsAsFactors = FALSE
    )
  }

  # The documented workflow expects `names_labels` in the global environment.
  assign("names_labels", results_df, envir = .GlobalEnv)

  invisible(results_df)
}

############################################### ### ##########################

### FUNCTION TO READ ANY DATA FILE AND KEEP ONLY THE VARIABLES WE SELECTED; THE
### FUNCTION SAVES EACH DATASET IN THE GLOBAL ENVIRONMENT. THE ONLY INPUT IS
### THE DATA FILE NAME.

#' Read one SAS file and keep only the variables selected for it.
#'
#' Relies on two objects found outside the function: `path_file` (the
#' directory holding the data files) and `names_labels` (a data frame with
#' the columns `variable_name` and `filename`, already filtered to the
#' variables to keep).
#'
#' @param df_file File name of the SAS data file, as stored in the `filename`
#'   column of `names_labels`.
#' @return Invisibly, a data frame holding only the selected variables. As a
#'   side effect it is assigned in the global environment under the part of
#'   `df_file` that precedes the first dot.
read_and_select <- function(df_file) {
  # Variables requested for this file; %in% never yields NA in the mask.
  keep_vars <- unique(
    names_labels$variable_name[names_labels$filename %in% df_file]
  )

  if (length(keep_vars) == 0) {
    warning(
      "No variables listed in `names_labels` for file: ", df_file,
      call. = FALSE
    )
  }

  # file.path() works whether or not `path_file` ends with a separator.
  df_tmp <- haven::read_sas(file.path(path_file, df_file))

  # List-style subsetting keeps column attributes such as labels.
  df_tmp <- as.data.frame(df_tmp[keep_vars])

  # Name the object after the file, dropping everything from the first dot.
  assign(stringr::str_extract(df_file, "[^.]+"), df_tmp, envir = .GlobalEnv)
}

############################################### ### ##########################

Leave a Reply

Your email address will not be published. Required fields are marked *