Revisions to Stratified random sampling with BigQuery?

added 527 characters in body

Source Link

edited Oct 20, 2018 at 3:58

reputation score 59853
23
23 gold badges
187
187 silver badges
367
367 bronze badges

SELECT COUNT(*) samples, category, ROUND(100*COUNT(*)/MAX(c),2) percentage
FROM (
  SELECT id, category, c  
  FROM table a
  JOIN table_stats b
  USING(category)
  WHERE RAND()< 1/100 
)
GROUP BY 2
ORDER BY 1 DESC
LIMIT 100

SELECT COUNT(*) samples, psample.category, ROUND(100*COUNT(*)/ANY_VALUE(c),2) percentage
FROM (
  SELECT ARRAY_AGG(a ORDER BY RAND()) cat_samples, category, ANY_VALUE(c) c
  FROM table a
  JOIN table_stats b
  USING(category)
  GROUP BY category
), UNNEST(cat_samples) psample WITH OFFSET off
WHERE off<0.02*c
GROUP BY 2

If this last approach is what you want, you might notice it failing when you actually want to get data out. An early LIMIT similar to the largest group size will make sure we don't sort more data than needed:

SELECT sample.*
FROM (
  SELECT ARRAY_AGG(a ORDER BY RAND() LIMIT 105000) cat_samples, category, ANY_VALUE(c) c
  FROM table a
  JOIN table_stats b
  USING(category)
  GROUP BY category
), UNNEST(cat_samples) sample WITH OFFSET off
WHERE off<0.02*c

SELECT COUNT(*) samples, category, ROUND(100*COUNT(*)/MAX(c),2) percentage
FROM (
  SELECT id, category, c  
  FROM table a
  JOIN table_stats b
  USING(category)
  WHERE RAND()< 1/100 
)
GROUP BY 2
ORDER BY 1 DESC
LIMIT 100

SELECT COUNT(*) samples, p.category, ROUND(100*COUNT(*)/ANY_VALUE(c),2) percentage
FROM (
  SELECT ARRAY_AGG(a ORDER BY RAND()) cat_samples, category, ANY_VALUE(c) c
  FROM table a
  JOIN table_stats b
  USING(category)
  GROUP BY category
), UNNEST(cat_samples) p WITH OFFSET off
WHERE off<0.02*c
GROUP BY 2

SELECT COUNT(*) samples, category, ROUND(100*COUNT(*)/MAX(c),2) percentage
FROM (
  SELECT id, category, c  
  FROM table a
  JOIN table_stats b
  USING(category)
  WHERE RAND()< 1/100 
)
GROUP BY 2

SELECT COUNT(*) samples, sample.category, ROUND(100*COUNT(*)/ANY_VALUE(c),2) percentage
FROM (
  SELECT ARRAY_AGG(a ORDER BY RAND()) cat_samples, category, ANY_VALUE(c) c
  FROM table a
  JOIN table_stats b
  USING(category)
  GROUP BY category
), UNNEST(cat_samples) sample WITH OFFSET off
WHERE off<0.02*c
GROUP BY 2

If this last approach is what you want, you might notice it failing when you actually want to get data out. An early LIMIT similar to the largest group size will make sure we don't sort more data than needed:

SELECT sample.*
FROM (
  SELECT ARRAY_AGG(a ORDER BY RAND() LIMIT 105000) cat_samples, category, ANY_VALUE(c) c
  FROM table a
  JOIN table_stats b
  USING(category)
  GROUP BY category
), UNNEST(cat_samples) sample WITH OFFSET off
WHERE off<0.02*c

added 495 characters in body

Source Link

edited Oct 20, 2018 at 3:48

Felipe Hoffa

reputation score 59853
23
23 gold badges
187
187 silver badges
367
367 bronze badges

SELECT ARRAY_LENGTH(cat_samples) samples, category, ROUND(100*ARRAY_LENGTH(cat_samples)/c,2) percentage
FROM (
  SELECT ARRAY_AGG(a ORDER BY RAND() LIMIT 20000) cat_samples, category, MAXANY_VALUE(c) c
  FROM table a
  JOIN table_stats b
  USING(category)
  GROUP BY category
)

If you want exactly 2% of each group:

SELECT COUNT(*) samples, p.category, ROUND(100*COUNT(*)/ANY_VALUE(c),2) percentage
FROM (
  SELECT ARRAY_AGG(a ORDER BY RAND()) cat_samples, category, ANY_VALUE(c) c
  FROM table a
  JOIN table_stats b
  USING(category)
  GROUP BY category
), UNNEST(cat_samples) p WITH OFFSET off
WHERE off<0.02*c
GROUP BY 2

SELECT ARRAY_LENGTH(cat_samples) samples, category, ROUND(100*ARRAY_LENGTH(cat_samples)/c,2) percentage
FROM (
  SELECT ARRAY_AGG(a ORDER BY RAND() LIMIT 20000) cat_samples, category, MAX(c) c
  FROM table a
  JOIN table_stats b
  USING(category)
  GROUP BY category
)

SELECT ARRAY_LENGTH(cat_samples) samples, category, ROUND(100*ARRAY_LENGTH(cat_samples)/c,2) percentage
FROM (
  SELECT ARRAY_AGG(a ORDER BY RAND() LIMIT 20000) cat_samples, category, ANY_VALUE(c) c
  FROM table a
  JOIN table_stats b
  USING(category)
  GROUP BY category
)

If you want exactly 2% of each group:

SELECT COUNT(*) samples, p.category, ROUND(100*COUNT(*)/ANY_VALUE(c),2) percentage
FROM (
  SELECT ARRAY_AGG(a ORDER BY RAND()) cat_samples, category, ANY_VALUE(c) c
  FROM table a
  JOIN table_stats b
  USING(category)
  GROUP BY category
), UNNEST(cat_samples) p WITH OFFSET off
WHERE off<0.02*c
GROUP BY 2

added 459 characters in body

Source Link

edited Oct 20, 2018 at 3:24

Felipe Hoffa

reputation score 59853
23
23 gold badges
187
187 silver badges
367
367 bronze badges

With #standardSQL, let's define our table and some stats over it:

WITH table AS (
  SELECT *, subreddit category
  FROM `fh-bigquery.reddit_comments.2018_09` a
), table_stats AS (
  SELECT *, SUM(c) OVER() total 
  FROM (
    SELECT category, COUNT(*) c 
    FROM table
    GROUP BY 1 
    HAVING c>1000000)
)

In this setup:

subreddit will be our category
we only want subreddits with more than 1000000 comments

So, if we want 1% of each category in our sample:

SELECT COUNT(*) samples, category, ROUND(100*COUNT(*)/MAX(c),2) percentage
FROM (
  SELECT id, category, c  
  FROM table a
  JOIN table_stats b
  USING(category)
  WHERE RAND()< 1/100 
)
GROUP BY 2
ORDER BY 1 DESC
LIMIT 100

Or let's say we want ~80,000 samples - but chosen proportionally through all categories:

SELECT COUNT(*) samples, category, ROUND(100*COUNT(*)/MAX(c),2) percentage
FROM (
  SELECT id, category, c  
  FROM table a
  JOIN table_stats b
  USING(category)
  WHERE RAND()< 80000/total
)
GROUP BY 2

Now, if you want to get the ~same number of samples from each group (let's say, 20,000):

SELECT COUNT(*) samples, category, ROUND(100*COUNT(*)/MAX(c),2) percentage
FROM (
  SELECT id, category, c  
  FROM table a
  JOIN table_stats b
  USING(category)
  WHERE RAND()< 20000/c
)
GROUP BY 2

If you want exactly 20,000 elements from each category:

SELECT ARRAY_LENGTH(cat_samples) samples, category, ROUND(100*ARRAY_LENGTH(cat_samples)/c,2) percentage
FROM (
  SELECT ARRAY_AGG(a ORDER BY RAND() LIMIT 20000) cat_samples, category, MAX(c) c
  FROM table a
  JOIN table_stats b
  USING(category)
  GROUP BY category
)

With #standardSQL, let's define our table and some stats over it:

WITH table AS (
  SELECT *, subreddit category
  FROM `fh-bigquery.reddit_comments.2018_09` a
), table_stats AS (
  SELECT *, SUM(c) OVER() total 
  FROM (
    SELECT category, COUNT(*) c 
    FROM table
    GROUP BY 1 
    HAVING c>1000000)
)

In this setup:

subreddit will be our category
we only want subreddits with more than 1000000 comments

So, if we want 1% of each category in our sample:

SELECT COUNT(*) samples, category, ROUND(100*COUNT(*)/MAX(c),2) percentage
FROM (
  SELECT id, category, c  
  FROM table a
  JOIN table_stats b
  USING(category)
  WHERE RAND()< 1/100 
)
GROUP BY 2
ORDER BY 1 DESC
LIMIT 100

Or let's say we want ~80,000 samples - but chosen proportionally through all categories:

SELECT COUNT(*) samples, category, ROUND(100*COUNT(*)/MAX(c),2) percentage
FROM (
  SELECT id, category, c  
  FROM table a
  JOIN table_stats b
  USING(category)
  WHERE RAND()< 80000/total
)
GROUP BY 2

Now, if you want to get the ~same number of samples from each group (let's say, 20,000):

SELECT COUNT(*) samples, category, ROUND(100*COUNT(*)/MAX(c),2) percentage
FROM (
  SELECT id, category, c  
  FROM table a
  JOIN table_stats b
  USING(category)
  WHERE RAND()< 20000/c
)
GROUP BY 2

With #standardSQL, let's define our table and some stats over it:

WITH table AS (
  SELECT *, subreddit category
  FROM `fh-bigquery.reddit_comments.2018_09` a
), table_stats AS (
  SELECT *, SUM(c) OVER() total 
  FROM (
    SELECT category, COUNT(*) c 
    FROM table
    GROUP BY 1 
    HAVING c>1000000)
)

In this setup:

subreddit will be our category
we only want subreddits with more than 1000000 comments

So, if we want 1% of each category in our sample:

SELECT COUNT(*) samples, category, ROUND(100*COUNT(*)/MAX(c),2) percentage
FROM (
  SELECT id, category, c  
  FROM table a
  JOIN table_stats b
  USING(category)
  WHERE RAND()< 1/100 
)
GROUP BY 2
ORDER BY 1 DESC
LIMIT 100

Or let's say we want ~80,000 samples - but chosen proportionally through all categories:

SELECT COUNT(*) samples, category, ROUND(100*COUNT(*)/MAX(c),2) percentage
FROM (
  SELECT id, category, c  
  FROM table a
  JOIN table_stats b
  USING(category)
  WHERE RAND()< 80000/total
)
GROUP BY 2

Now, if you want to get the ~same number of samples from each group (let's say, 20,000):

SELECT COUNT(*) samples, category, ROUND(100*COUNT(*)/MAX(c),2) percentage
FROM (
  SELECT id, category, c  
  FROM table a
  JOIN table_stats b
  USING(category)
  WHERE RAND()< 20000/c
)
GROUP BY 2

If you want exactly 20,000 elements from each category:

SELECT ARRAY_LENGTH(cat_samples) samples, category, ROUND(100*ARRAY_LENGTH(cat_samples)/c,2) percentage
FROM (
  SELECT ARRAY_AGG(a ORDER BY RAND() LIMIT 20000) cat_samples, category, MAX(c) c
  FROM table a
  JOIN table_stats b
  USING(category)
  GROUP BY category
)

added 428 characters in body

Source Link

edited Oct 20, 2018 at 2:38

Felipe Hoffa

reputation score 59853
23
23 gold badges
187
187 silver badges
367
367 bronze badges

Loading

Source Link

answered Oct 20, 2018 at 0:41

Felipe Hoffa

reputation score 59853
23
23 gold badges
187
187 silver badges
367
367 bronze badges

Loading

Collectives™ on Stack Overflow

Return to Answer

Post Timeline