Skip to main content
The 2026 Annual Developer Survey is live— take the Survey today!
added 527 characters in body
Source Link
Felipe Hoffa
SELECT COUNT(*) samples, category, ROUND(100*COUNT(*)/MAX(c),2) percentage
FROM (
  SELECT id, category, c  
  FROM table a
  JOIN table_stats b
  USING(category)
  WHERE RAND()< 1/100 
)
GROUP BY 2
ORDER BY 1 DESC
LIMIT 100
SELECT COUNT(*) samples, psample.category, ROUND(100*COUNT(*)/ANY_VALUE(c),2) percentage
FROM (
  SELECT ARRAY_AGG(a ORDER BY RAND()) cat_samples, category, ANY_VALUE(c) c
  FROM table a
  JOIN table_stats b
  USING(category)
  GROUP BY category
), UNNEST(cat_samples) psample WITH OFFSET off
WHERE off<0.02*c
GROUP BY 2

If this last approach is what you want, you might notice it failing when you actually want to get data out. An early LIMIT similar to the largest group size will make sure we don't sort more data than needed:

SELECT sample.*
FROM (
  SELECT ARRAY_AGG(a ORDER BY RAND() LIMIT 105000) cat_samples, category, ANY_VALUE(c) c
  FROM table a
  JOIN table_stats b
  USING(category)
  GROUP BY category
), UNNEST(cat_samples) sample WITH OFFSET off
WHERE off<0.02*c
SELECT COUNT(*) samples, category, ROUND(100*COUNT(*)/MAX(c),2) percentage
FROM (
  SELECT id, category, c  
  FROM table a
  JOIN table_stats b
  USING(category)
  WHERE RAND()< 1/100 
)
GROUP BY 2
ORDER BY 1 DESC
LIMIT 100
SELECT COUNT(*) samples, p.category, ROUND(100*COUNT(*)/ANY_VALUE(c),2) percentage
FROM (
  SELECT ARRAY_AGG(a ORDER BY RAND()) cat_samples, category, ANY_VALUE(c) c
  FROM table a
  JOIN table_stats b
  USING(category)
  GROUP BY category
), UNNEST(cat_samples) p WITH OFFSET off
WHERE off<0.02*c
GROUP BY 2
SELECT COUNT(*) samples, category, ROUND(100*COUNT(*)/MAX(c),2) percentage
FROM (
  SELECT id, category, c  
  FROM table a
  JOIN table_stats b
  USING(category)
  WHERE RAND()< 1/100 
)
GROUP BY 2
SELECT COUNT(*) samples, sample.category, ROUND(100*COUNT(*)/ANY_VALUE(c),2) percentage
FROM (
  SELECT ARRAY_AGG(a ORDER BY RAND()) cat_samples, category, ANY_VALUE(c) c
  FROM table a
  JOIN table_stats b
  USING(category)
  GROUP BY category
), UNNEST(cat_samples) sample WITH OFFSET off
WHERE off<0.02*c
GROUP BY 2

If this last approach is what you want, you might notice it failing when you actually want to get data out. An early LIMIT similar to the largest group size will make sure we don't sort more data than needed:

SELECT sample.*
FROM (
  SELECT ARRAY_AGG(a ORDER BY RAND() LIMIT 105000) cat_samples, category, ANY_VALUE(c) c
  FROM table a
  JOIN table_stats b
  USING(category)
  GROUP BY category
), UNNEST(cat_samples) sample WITH OFFSET off
WHERE off<0.02*c
added 495 characters in body
Source Link
Felipe Hoffa
SELECT ARRAY_LENGTH(cat_samples) samples, category, ROUND(100*ARRAY_LENGTH(cat_samples)/c,2) percentage
FROM (
  SELECT ARRAY_AGG(a ORDER BY RAND() LIMIT 20000) cat_samples, category, MAXANY_VALUE(c) c
  FROM table a
  JOIN table_stats b
  USING(category)
  GROUP BY category
)

enter image description here

If you want exactly 2% of each group:

SELECT COUNT(*) samples, p.category, ROUND(100*COUNT(*)/ANY_VALUE(c),2) percentage
FROM (
  SELECT ARRAY_AGG(a ORDER BY RAND()) cat_samples, category, ANY_VALUE(c) c
  FROM table a
  JOIN table_stats b
  USING(category)
  GROUP BY category
), UNNEST(cat_samples) p WITH OFFSET off
WHERE off<0.02*c
GROUP BY 2

enter image description here

SELECT ARRAY_LENGTH(cat_samples) samples, category, ROUND(100*ARRAY_LENGTH(cat_samples)/c,2) percentage
FROM (
  SELECT ARRAY_AGG(a ORDER BY RAND() LIMIT 20000) cat_samples, category, MAX(c) c
  FROM table a
  JOIN table_stats b
  USING(category)
  GROUP BY category
)

enter image description here

SELECT ARRAY_LENGTH(cat_samples) samples, category, ROUND(100*ARRAY_LENGTH(cat_samples)/c,2) percentage
FROM (
  SELECT ARRAY_AGG(a ORDER BY RAND() LIMIT 20000) cat_samples, category, ANY_VALUE(c) c
  FROM table a
  JOIN table_stats b
  USING(category)
  GROUP BY category
)

enter image description here

If you want exactly 2% of each group:

SELECT COUNT(*) samples, p.category, ROUND(100*COUNT(*)/ANY_VALUE(c),2) percentage
FROM (
  SELECT ARRAY_AGG(a ORDER BY RAND()) cat_samples, category, ANY_VALUE(c) c
  FROM table a
  JOIN table_stats b
  USING(category)
  GROUP BY category
), UNNEST(cat_samples) p WITH OFFSET off
WHERE off<0.02*c
GROUP BY 2

enter image description here

added 459 characters in body
Source Link
Felipe Hoffa

With #standardSQL, let's define our table and some stats over it:

WITH table AS (
  SELECT *, subreddit category
  FROM `fh-bigquery.reddit_comments.2018_09` a
), table_stats AS (
  SELECT *, SUM(c) OVER() total 
  FROM (
    SELECT category, COUNT(*) c 
    FROM table
    GROUP BY 1 
    HAVING c>1000000)
)

In this setup:

  • subreddit will be our category
  • we only want subreddits with more than 1000000 comments

So, if we want 1% of each category in our sample:

SELECT COUNT(*) samples, category, ROUND(100*COUNT(*)/MAX(c),2) percentage
FROM (
  SELECT id, category, c  
  FROM table a
  JOIN table_stats b
  USING(category)
  WHERE RAND()< 1/100 
)
GROUP BY 2
ORDER BY 1 DESC
LIMIT 100

enter image description here

Or let's say we want ~80,000 samples - but chosen proportionally through all categories:

SELECT COUNT(*) samples, category, ROUND(100*COUNT(*)/MAX(c),2) percentage
FROM (
  SELECT id, category, c  
  FROM table a
  JOIN table_stats b
  USING(category)
  WHERE RAND()< 80000/total
)
GROUP BY 2

enter image description here

Now, if you want to get the ~same number of samples from each group (let's say, 20,000):

SELECT COUNT(*) samples, category, ROUND(100*COUNT(*)/MAX(c),2) percentage
FROM (
  SELECT id, category, c  
  FROM table a
  JOIN table_stats b
  USING(category)
  WHERE RAND()< 20000/c
)
GROUP BY 2

enter image description here

If you want exactly 20,000 elements from each category:

SELECT ARRAY_LENGTH(cat_samples) samples, category, ROUND(100*ARRAY_LENGTH(cat_samples)/c,2) percentage
FROM (
  SELECT ARRAY_AGG(a ORDER BY RAND() LIMIT 20000) cat_samples, category, MAX(c) c
  FROM table a
  JOIN table_stats b
  USING(category)
  GROUP BY category
)

enter image description here

With #standardSQL, let's define our table and some stats over it:

WITH table AS (
  SELECT *, subreddit category
  FROM `fh-bigquery.reddit_comments.2018_09` a
), table_stats AS (
  SELECT *, SUM(c) OVER() total 
  FROM (
    SELECT category, COUNT(*) c 
    FROM table
    GROUP BY 1 
    HAVING c>1000000)
)

In this setup:

  • subreddit will be our category
  • we only want subreddits with more than 1000000 comments

So, if we want 1% of each category in our sample:

SELECT COUNT(*) samples, category, ROUND(100*COUNT(*)/MAX(c),2) percentage
FROM (
  SELECT id, category, c  
  FROM table a
  JOIN table_stats b
  USING(category)
  WHERE RAND()< 1/100 
)
GROUP BY 2
ORDER BY 1 DESC
LIMIT 100

enter image description here

Or let's say we want ~80,000 samples - but chosen proportionally through all categories:

SELECT COUNT(*) samples, category, ROUND(100*COUNT(*)/MAX(c),2) percentage
FROM (
  SELECT id, category, c  
  FROM table a
  JOIN table_stats b
  USING(category)
  WHERE RAND()< 80000/total
)
GROUP BY 2

enter image description here

Now, if you want to get the ~same number of samples from each group (let's say, 20,000):

SELECT COUNT(*) samples, category, ROUND(100*COUNT(*)/MAX(c),2) percentage
FROM (
  SELECT id, category, c  
  FROM table a
  JOIN table_stats b
  USING(category)
  WHERE RAND()< 20000/c
)
GROUP BY 2

enter image description here

With #standardSQL, let's define our table and some stats over it:

WITH table AS (
  SELECT *, subreddit category
  FROM `fh-bigquery.reddit_comments.2018_09` a
), table_stats AS (
  SELECT *, SUM(c) OVER() total 
  FROM (
    SELECT category, COUNT(*) c 
    FROM table
    GROUP BY 1 
    HAVING c>1000000)
)

In this setup:

  • subreddit will be our category
  • we only want subreddits with more than 1000000 comments

So, if we want 1% of each category in our sample:

SELECT COUNT(*) samples, category, ROUND(100*COUNT(*)/MAX(c),2) percentage
FROM (
  SELECT id, category, c  
  FROM table a
  JOIN table_stats b
  USING(category)
  WHERE RAND()< 1/100 
)
GROUP BY 2
ORDER BY 1 DESC
LIMIT 100

enter image description here

Or let's say we want ~80,000 samples - but chosen proportionally through all categories:

SELECT COUNT(*) samples, category, ROUND(100*COUNT(*)/MAX(c),2) percentage
FROM (
  SELECT id, category, c  
  FROM table a
  JOIN table_stats b
  USING(category)
  WHERE RAND()< 80000/total
)
GROUP BY 2

enter image description here

Now, if you want to get the ~same number of samples from each group (let's say, 20,000):

SELECT COUNT(*) samples, category, ROUND(100*COUNT(*)/MAX(c),2) percentage
FROM (
  SELECT id, category, c  
  FROM table a
  JOIN table_stats b
  USING(category)
  WHERE RAND()< 20000/c
)
GROUP BY 2

enter image description here

If you want exactly 20,000 elements from each category:

SELECT ARRAY_LENGTH(cat_samples) samples, category, ROUND(100*ARRAY_LENGTH(cat_samples)/c,2) percentage
FROM (
  SELECT ARRAY_AGG(a ORDER BY RAND() LIMIT 20000) cat_samples, category, MAX(c) c
  FROM table a
  JOIN table_stats b
  USING(category)
  GROUP BY category
)

enter image description here

added 428 characters in body
Source Link
Felipe Hoffa
Loading
Source Link
Felipe Hoffa
Loading
lang-sql