Subjectively

dd if=/dev/random | kirk > blog

Subjectively header image 2

Examine the most common values in every column in a PostgreSQL table

October 31st, 2013 · No Comments · PostgreSQL

Here’s a shell script that pivots a PostgreSQL table by the frequency of the values in each column. It helps me find indicator flags in a poorly described data source by showing me the most frequently used values in a column. It can also be really handy for seeing the distribution of values, to approximate the usefulness of an index on a column or multiple columns.
The resulting data has 2 columns for every column in the table being examined. For this table:

1
2
3
4
5
6
CREATE TABLE employee (
  id serial PRIMARY KEY,
  active_flg BOOLEAN,
  city text,
  ...
  );

The resulting analysis looks like this:

ordinal active_flg_content active_flg_count city_content city_count
1 true 75 Irving 51
2 false 4 Falls Church 4
3 Dallas 2
4 Lithia Springs 2
5 Spring 2
6 Carrollton 1
7 Fort Worth 1
8 Aledo 1
9 New York 1
10 Trussville 1
11 Wylie 1
12 Wentzville 1
13 North Charleston 1
14 Charlotte 1
15 Miami 1
16 Sachse 1
17 Grafton 1
18 Lebanon 1
19 Plano 1
20 San Antonio 1
21 Castle Rock 1
22 Fairfax 1
23 Cave Creek 1

Here’s the shell script. It produces 2 tables: [mytable]_forensic and [mytable]_aggregate_pivot. There are 2 columns in _aggregate_pivot for every column in [mytable]: the first one holds the value, and the second one is the frequency of that value in the original table.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
#!/bin/bash -
#===============================================================================
#
#          FILE: pg_examine_cardinality.sh
#
#         USAGE: ./pg_examine_cardinality.sh
#
#   DESCRIPTION: Create a table called [tablename]_forensic that transforms the data into name/value pairs.
#		  Then transform [tablename]_forensic into [tablename]_aggregate_pivot
#			 to show the top 1000 values for each column.
#
#       OPTIONS: ---
#  REQUIREMENTS: psql on PATH; credentials for the target cluster (e.g. ~/.pgpass)
#          BUGS: ---
#         NOTES: Connection defaults below may be overridden from the environment.
#        AUTHOR: Kirk Roybal (), kirk@webfinish.com
#  ORGANIZATION: 
#       CREATED: 10/30/2013 11:57
#      REVISION:  ---
#===============================================================================

set -o nounset                              # Treat unset variables as an error

ScriptVersion="1.0"

# Connection defaults. Each can be overridden by pre-setting the variable in
# the environment (e.g. `database=mydb host=myhost ./pg_examine_cardinality.sh`),
# so the script is no longer tied to one site's cluster. Unset/empty variables
# fall back to the historical hard-coded values, so existing usage is unchanged.
database="${database:-ods}"
host="${host:-dpggen}"
user="${user:-adservices}"
schema="${schema:-public}"
 
#===  FUNCTION  ================================================================
#         NAME:  usage
#  DESCRIPTION:  Display usage information.
#===============================================================================
# NOTE(review): this region was garbled when the script was published --
# "cat < 2 just turn on bash debugging" is not valid shell.  The original
# almost certainly contained a here-doc (cat <<- EOF ... EOF) printing the
# usage text, followed by an option-parsing loop (getopts) that assigned
# $table (the table to examine) and $log (the -l verbosity level): both are
# referenced below but never assigned in this copy, which would trip
# `set -o nounset`.  Recover the missing section from the original source
# before running this script.
function usage ()
{
	cat < 2 just turn on bash debugging
# When the log level is above 2, switch to full bash tracing instead of the
# script's own echo-based logging.
[[ $log -gt 2 ]] && {
	set -x
	# turn off simple log, you're gonna see it anyway
	log=
}
 
# -l not used or bash debugging turned on
# Unset $log entirely so the `[[ -v log ]]` guards below skip all logging.
[[ "$log" == '' ]] && unset log
 
# idempotent script cleans before run
sql="DROP TABLE IF EXISTS ${schema}.${table}_forensic"

[[ -v log ]] && {
	echo "DROPPING ${schema}.${table}_forensic"
	[[ $log -gt 1 ]] && echo "$sql"
}

# Quote every expansion passed to psql (SC2086): unquoted $host/$user/$database
# would word-split or glob if they ever contained whitespace or metacharacters.
psql -h "$host" -U "$user" "$database" -qtAc "$sql"

#Create the name/value pair table
# This query generates SQL rather than running it directly: it emits one
# "SELECT value, column name, count(*) ... GROUP BY" per column of $table
# (skipping known surrogate-key/control columns), glued together with UNION
# into a single CREATE TABLE ... AS statement.
sql="SELECT 'CREATE TABLE ${schema}.${table}_forensic AS ' || array_to_string(array(
SELECT 'SELECT ' || column_name || '::text column_content,  ''' || column_name || ''' column_name, count(*) column_count FROM ${schema}.$table GROUP BY 1' FROM INFORMATION_SCHEMA.COLUMNS
WHERE table_catalog = '${database}' and table_schema = '${schema}' AND table_name = '$table' AND column_name NOT IN ('load_date', 'id', 'application_id', 'account_master_id')), E'\nUNION\n') || ';';"

[[ -v log ]] && {
	echo "CREATING ${schema}.${table}_forensic CREATE STATEMENT"
	[[ $log -gt 1 ]] && echo "$sql"
}

# Run the generator; its output (the CREATE TABLE ... AS statement) becomes
# the next statement to execute.
sql=$(psql -h "$host" -U "$user" "$database" -qtAc "$sql")

[[ -v log ]] && {
	echo "CREATING ${schema}.${table}_forensic"
	[[ $log -gt 1 ]] && echo "$sql"
}

psql -h "$host" -U "$user" "$database" -qtAc "$sql"
 
#add the table count to it.
sql="SELECT COUNT(*) FROM ${schema}.$table;"

[[ -v log ]] && {
	echo "COUNTING ${schema}.${table}"
	[[ $log -gt 1 ]] && echo "$sql"
}

# Quoted expansions (SC2086) so connection parameters can't word-split.
table_count=$(psql -h "$host" -U "$user" "$database" -qtAc "$sql")

# Denormalize the total row count onto every forensic row (used later to drop
# columns whose every value is identical), and add a surrogate key plus a
# placeholder for the per-column frequency rank filled in by the next UPDATE.
sql="ALTER TABLE ${schema}.${table}_forensic ADD COLUMN table_count bigint NOT NULL DEFAULT $table_count,
	ADD COLUMN id bigserial primary key,
	ADD COLUMN column_ordinal integer;"

[[ -v log ]] && {
	echo "ADDING id, column_ordinal, table_count TO ${schema}.${table}_forensic"
	[[ $log -gt 1 ]] && echo "$sql"
}

psql -h "$host" -U "$user" "$database" -qtAc "$sql"
 
#add an ordinal to indicate order of frequency
# row_number() ranks each column's values by descending frequency; the rank is
# written back so the pivot can LEFT JOIN every column on the same ordinal.
sql="UPDATE ${schema}.${table}_forensic lmf
	SET column_ordinal = a.rank_order
FROM (
SELECT id, row_number() OVER (PARTITION BY column_name ORDER BY column_name, column_count DESC) rank_order
FROM ${schema}.${table}_forensic ) a
WHERE lmf.id = a.id;"

[[ -v log ]] && {
	echo "SETTING ${schema}.${table}_forensic column_ordinal"
	[[ $log -gt 1 ]] && echo "$sql"
}

# Quoted expansions (SC2086).
psql -h "$host" -U "$user" "$database" -qtAc "$sql"
 
#index the name/value pair table so it doesn't take a week to create the pivot
# CLUSTER physically reorders the table on the index so the pivot's per-column
# scans are sequential.
sql="CREATE INDEX idx_${table}_forensic_cn_cc ON ${schema}.${table}_forensic(column_name, column_content);
	CLUSTER ${schema}.${table}_forensic USING idx_${table}_forensic_cn_cc;"

[[ -v log ]] && {
	echo "CREATING index  idx_${table}_forensic_cn_cc ON ${schema}.${table}_forensic"
	[[ $log -gt 1 ]] && echo "$sql"
}

# Quoted expansions (SC2086).
psql -h "$host" -U "$user" "$database" -qtAc "$sql"
 
#more idempotency
sql="DROP TABLE IF EXISTS ${schema}.${table}_aggregate_pivot"

[[ -v log ]] && {
	echo "DROPPING ${schema}.${table}_aggregate_pivot"
	[[ $log -gt 1 ]] && echo "$sql"
}

# Quoted expansions (SC2086).
psql -h "$host" -U "$user" "$database" -qtAc "$sql"

# Cap the pivot at the top 1000 values per column (or fewer if no column has
# that many distinct values).
sql="SELECT least(1000,max(column_ordinal))
FROM ${schema}.${table}_forensic;"

colcount=$(psql -h "$host" -U "$user" "$database" -qtAc "$sql")
 
#pivot the data
# Three CTEs build a dynamic SELECT: the_data generates, per surviving column,
# the output column names and a subquery returning its ranked values;
# field_list and data_list flatten those into the SELECT list and the chain of
# LEFT JOINs against a generate_series() spine of ordinals.
sql="--provide a rank, column names to select, and the SELECT statement to get them
WITH the_data AS (SELECT row_number() OVER () rank_order, column_name || '_column_content, ' || column_name || '_column_count' cols,
	'(SELECT column_content ' || column_name || '_column_content, column_count ' ||column_name || '_column_count, column_ordinal 
	  FROM ${schema}.${table}_forensic WHERE column_name = ''' || column_name || ''' AND column_ordinal <= $colcount) ' sql
FROM (
SELECT DISTINCT column_name
FROM ${schema}.${table}_forensic
-- Eliminate columns that have only one value for all rows, a unique value for every row, and some control columns we already know about
WHERE column_count <> table_count AND column_count <> 1 AND column_name NOT IN ('account_master_id', 'account_number', 'application_id', 'id')
ORDER BY 1) a)
-- create two recordsets with a single column each
, field_list AS (SELECT 'o, ' || array_to_string(array(SELECT cols FROM the_data), E', ') fields)
, data_list AS (SELECT array_to_string(array(SELECT sql || 't' || rank_order || E'\nON o = t' || rank_order || E'.column_ordinal' FROM the_data),E'\nLEFT JOIN\n'))
-- assemble the final query SELECT fields FROM series JOIN field1 JOIN field2 JOIN field3 ...
SELECT 'SELECT ' || (SELECT fields FROM field_list) || E'\nFROM generate_series(1,$colcount) o\nLEFT JOIN\n' || (SELECT * FROM data_list);"

[[ -v log ]] && {
	echo "CREATE SELECT for ${schema}.${table}_aggregate_pivot"
	[[ $log -gt 1 ]] && echo "$sql"
}

#create a table with the pivoted value frequency data
# The inner psql runs the generator above; its SELECT output is wrapped in
# CREATE TABLE ... AS and executed by the outer psql.  All expansions quoted
# (SC2086).
sql="CREATE TABLE ${schema}.${table}_aggregate_pivot AS $(psql -h "$host" -U "$user" "$database" -qtAc "$sql")"

[[ -v log ]] && {
	echo "CREATE TABLE for ${schema}.${table}_aggregate_pivot"
	[[ $log -gt 1 ]] && echo "$sql"
}

psql -h "$host" -U "$user" "$database" -qtAc "$sql"
 
#add a primary key so we can look at it in pgAdmin3
sql="ALTER TABLE ${schema}.${table}_aggregate_pivot ADD COLUMN id bigserial primary key;"

[[ -v log ]] && {
	echo "ADDING ${schema}.${table}_aggregate_pivot id primary key"
	[[ $log -gt 1 ]] && echo "$sql"
}

# Quoted expansions (SC2086).
psql -h "$host" -U "$user" "$database" -qtAc "$sql"

Tags:

No Comments so far ↓

There are no comments yet...Kick things off by filling out the form below.

Leave a Comment

You must log in to post a comment.