A Python/R cheat sheet for SQL developers. The variable “df” is the data frame / table name. For python code, pandas has been imported as pd, and numpy as np. For R, some code assumes dplyr is loaded via libary(dplyr).
Select individual column
select column_name from df
df.column_name # Alternative df['column_name'] # Using a variable myvar = 'column_name' df[myvar]
df$column_name # Alternative, needed if column name has spaces/weird characters df$`column_name` # Alternative df[, 'column_name'] # Using a variable myvar <- 'column_name' df[, myvar]
Select multiple columns
select column_a, column_b, column_c from df
# By name df[['column_a', 'column_b', 'column_c']] # By index, where they are the first three columns df.iloc[:, 0:4]
# By name df[, c('column_a', 'column_b', 'column_c')] # By name using dplyr df %>% select(column_a, column_b, column_c) # By index, where they are the first three columns df[, 1:3]
Top 10 rows
select * from df limit 10
df.head(10)
# standard way head(df, 10) # with dplyr df %>% head(10)
Unique Values and count for a column
# Just the values select distinct(column_name) from df # Values with a count select column_name, count(*) from df group by column_name
# Just the values df.column_name.unique() # Values with a count df.column_name.value_counts()
# Just the values unique(df$column_name) # Values with a count table(df$column_name) # Values with a count using dplyr df %>% select(column_name) %>% count(column_name)
Filtering by column value
# Equal to a specified value select * from df where col_name = 'value' # Equal to a value in a set select * from df where col_name in ('value1', 'value2') # Value is empty select * from df where col_name is null # Find a word within a string select * from df where col_string like '%word%' # Filter by numeric value greater than 100 select * from df where col_numeric > 100
# Equal to a specified value df[df.col_name == 'value'] # Equal to a value in a set df[df.col_name.isin(['value1', 'value2'])] # Value is empty df[df.col_name.isnull()] # Find a word within a string df[df.col_string.str.contains('word') == True] # Filter by numeric value greater than 100 df[df.col_numeric > 100]
# Equal to a specified value df[df$col_name == 'value', ] # Equal to a value in a set df[df$col_name %in% c('value1', 'value2'), ] # Value is empty df[is.na(df$col_name), ] # Find a word within a string df[grepl('word', df$col_string, fixed=T), ] # Filter by numeric value greater than 100 df[df$col_name > 100, ]
Multi column and multi table/dataframe conditions
# Condition between two columns of same table select * from df where col_a != col_b # Inner join between two tables select df1.col_a, df1.col_b, df2.col_x, df2.col_y from df1, df2 where df1.col_common = df2.col_common
# Condition between two columns of same table df[df['col_a'] != df['col_b']] # Inner join between two tables pd.merge(df1, df2, on='col_common', how='inner')[['col_a', 'col_b', 'col_x', 'col_y']]
# Condition between two columns of same table df[df$col_a != df$col_b, ] # Inner join between two tables (uses dplyr) merge(df1, df2, by="col_common") %>% select(col_a, col_b, col_x, col_y)
Update and Alter Statements
# Setting values that are null to fill_value update df set column_name = 'fill_value' where column_name is null # Setting values conditionally update df set column_name = 'new_value' where column_name in ('old_value1', 'old_value2') # Conditional delete of rows delete from df where column_name = 'some_value' # Create copy of column old_column in new column called new_column # (Oracle syntax - other DBs differ) alter table df add (new_column varchar2(50)); update df set new_column = old_column; # Renaming a column (Oracle syntax - other DBs differ) alter table df rename column "old column name" "newcolumnname"
# Setting values that are null to fill_value df['column_name'] = df['column_name'].fillna('fill_value') # Setting values conditionally df.loc[df['column_name'].isin(['old_value1', 'old_value2']), 'column_name'] = 'new_value' # Conditional delete of rows df.drop(np.where(df['column_name'] == 'some_value')[0], inplace=True) # Create copy of column old_column in new column called new_column df['new_column'] = df['old_column'] # Renaming a column df.rename(columns = {'old column name':'newcolumnname'}, inplace=True)
# Setting values that are null to fill_value df$column_name[is.na(df$column_name)] <- 'fill_value' # Setting values conditionally df$column_name[df$column_name %in% c('old_value1', 'old_value2')] <- 'new_value' # Conditional delete of rows df <- df[df$column_name != 'some_value',] # Create copy of column old_column in new column called new_column df$new_column <- df$old_column # Create copy of column old column in new column called new_column (i.e. deal with spaces) df$new_column = df$`old column` # Renaming a column names(df)[names(df) == "old column name"] <- "newcolumnname" # Renaming a column with dplyr df <- df %>% rename("newcolumname" = "old column name")