@@ -128,39 +128,26 @@ def _execute_query(self, query: str) -> list[dict[str, Any]]:
128128 return results
129129
130130 def _build_count_query (self , days_to_sample : int , years_back : int ) -> str :
131- """Build SQL query that creates temporary table of random days and counts repositories."""
131+ """Build SQL query to count repositories per random day using wildcard tables."""
132+ cutoff_date = (datetime .now () - timedelta (days = 365 * years_back )).strftime (
133+ "%Y%m%d"
134+ )
132135 return f"""
133- -- Define parameters
134- DECLARE days_to_sample INT64 DEFAULT { days_to_sample } ;
135- DECLARE years_back INT64 DEFAULT { years_back } ;
136-
137- -- Create a table of random dates to sample from
138- CREATE TEMP TABLE random_dates AS (
136+ WITH random_dates AS (
139137 SELECT
140138 FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(),
141- INTERVAL CAST(FLOOR(RAND() * (365 * years_back)) AS INT64) DAY)) AS day
142- FROM UNNEST(GENERATE_ARRAY(1, days_to_sample))
143- );
144-
145- -- Count unique repositories per day
139+ INTERVAL CAST(FLOOR(RAND() * (365 * { years_back } )) AS INT64) DAY)) AS day
140+ FROM UNNEST(GENERATE_ARRAY(1, { days_to_sample } ))
141+ )
146142 SELECT
147143 rd.day AS sample_day,
148- COUNT(DISTINCT event.repo_name ) AS repo_count
144+ COUNT(DISTINCT repo.name ) AS repo_count
149145 FROM random_dates rd
150- CROSS JOIN (
151- SELECT repo.name AS repo_name, created_at
152- FROM (
153- EXECUTE IMMEDIATE FORMAT(
154- "SELECT repo.name, created_at
155- FROM `githubarchive.day.%s`
156- WHERE created_at >= TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL %d YEAR)
157- LIMIT 100000",
158- rd.day,
159- years_back
160- )
161- )
162- ) event
146+ JOIN `githubarchive.day.*` gh
147+ ON _TABLE_SUFFIX = rd.day
148+ AND _TABLE_SUFFIX >= '{ cutoff_date } '
163149 GROUP BY rd.day
150+ HAVING COUNT(DISTINCT repo.name) > 0
164151 ORDER BY repo_count DESC
165152 """
166153
@@ -173,31 +160,19 @@ def _build_day_query(
173160 samples_to_take = day_data .get ("samples_to_take" , 1 )
174161
175162 return f"""
176- -- Day { i + 1 } : { day } with { repo_count } repositories
177163 SELECT DISTINCT
178- event.repo_name AS full_name,
179- SPLIT(event.repo_name , '/')[SAFE_OFFSET(1)] AS name,
180- SPLIT(event.repo_name , '/')[SAFE_OFFSET(0)] AS owner,
181- event.repo_url AS html_url,
182- event. created_at,
164+ repo.name AS full_name,
165+ SPLIT(repo.name , '/')[SAFE_OFFSET(1)] AS name,
166+ SPLIT(repo.name , '/')[SAFE_OFFSET(0)] AS owner,
167+ CONCAT('https://github.com/', repo.name) AS html_url,
168+ created_at,
183169 '{ day } ' AS sampled_from,
184- event. event_type,
170+ type AS event_type,
185171 { repo_count } AS day_repo_count,
186172 { samples_to_take } AS samples_allocated
187- FROM (
188- EXECUTE IMMEDIATE FORMAT(
189- "SELECT
190- repo.name AS repo_name,
191- repo.url AS repo_url,
192- created_at,
193- type AS event_type
194- FROM `githubarchive.day.%s`
195- WHERE created_at >= TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL %d YEAR)
196- LIMIT 100000",
197- '{ day } ',
198- { years_back }
199- )
200- ) AS event
173+ FROM `githubarchive.day.{ day } `
174+ WHERE repo.name IS NOT NULL
175+ AND repo.name LIKE '%/%'
201176 ORDER BY RAND({ self ._seed } + { i } )
202177 LIMIT { samples_to_take }
203178 """
@@ -206,7 +181,6 @@ def _combine_day_queries(self, day_queries: list[str], n_samples: int) -> str:
206181 """Combine day queries into final query and deduplicate results."""
207182 combined_query = "\n UNION ALL\n " .join (day_queries )
208183 return f"""
209- -- Final combined query with deduplication
210184 SELECT DISTINCT
211185 full_name,
212186 name,
@@ -275,7 +249,7 @@ def sample_by_day(
275249
276250 filtered_count_before = len (valid_repos )
277251 if kwargs :
278- self .results : list [ dict [ str , Any ]] = filter_repos (valid_repos , ** kwargs )
252+ self .results = filter_repos (valid_repos , ** kwargs )
279253 filtered_count_after = len (self .results )
280254 if filtered_count_before != filtered_count_after :
281255 self .logger .info (
@@ -287,7 +261,7 @@ def sample_by_day(
287261 )
288262
289263 if valid_repos :
290- day_counts_map = {}
264+ day_counts_map : dict [ str , dict [ str , int ]] = {}
291265 for repo in valid_repos :
292266 day_sampled = repo .get ("sampled_from" , "unknown" )
293267 day_repo_count = repo .get ("day_repo_count" , 0 )
@@ -302,7 +276,8 @@ def sample_by_day(
302276 self .logger .info (f"Sampled from { len (day_counts_map )} different days" )
303277 for day_str , data in sorted (day_counts_map .items ()):
304278 self .logger .debug (
305- f"Day { day_str } : { data ['count' ]} /{ data ['allocated' ]} samples from { data ['repos' ]} repos"
279+ f"Day { day_str } : { data ['count' ]} /{ data ['allocated' ]} samples "
280+ f"from { data ['repos' ]} repos"
306281 )
307282
308283 return self .results
@@ -315,59 +290,99 @@ def sample_active(
315290 languages : list [str ] | None = None ,
316291 ** kwargs : Any ,
317292 ) -> list [dict [str , Any ]]:
318- """Sample repositories with recent commit activity."""
293+ """Sample repositories with recent commit activity.
294+
295+ Args:
296+ n_samples: Number of repositories to sample
297+ created_after: Filter commits after this timestamp
298+ created_before: Filter commits before this timestamp
299+ languages: List of programming languages to filter by (uses github_repos.languages)
300+ **kwargs: Additional filter criteria
301+
302+ Returns:
303+ List of repository dictionaries
304+ """
319305 self .logger .info (
320306 f"Sampling { n_samples } active repositories based on commit history"
321307 )
322308 if kwargs :
323309 self .logger .info (f"Filter criteria: { kwargs } " )
324310
325311 if created_after :
326- created_after = format_timestamp_query (created_after )
312+ created_after_str = format_timestamp_query (created_after )
327313 else :
328314 one_year_ago = (datetime .now () - timedelta (days = 365 )).strftime ("%Y-%m-%d" )
329- created_after = f"'{ one_year_ago } '"
315+ created_after_str = f"'{ one_year_ago } '"
330316
331317 if created_before :
332- created_before = format_timestamp_query (created_before )
318+ created_before_str = format_timestamp_query (created_before )
333319 else :
334- created_before = "CURRENT_TIMESTAMP()"
320+ created_before_str = "CURRENT_TIMESTAMP()"
335321
336- self .logger .info (f"Time period: { created_after } to { created_before } " )
322+ self .logger .info (f"Time period: { created_after_str } to { created_before_str } " )
337323
338- lang_list = None
339324 if languages :
340325 lang_list = ", " .join ([f"'{ lang } '" for lang in languages ])
341326 self .logger .info (f"Filtering for languages: { lang_list } " )
342-
343- query = f"""
344- WITH repo_set AS (
327+ # Use JOIN with languages table for proper language filtering
328+ query = f"""
329+ WITH repo_set AS (
330+ SELECT DISTINCT
331+ repo AS full_name,
332+ SPLIT(repo, '/')[OFFSET(1)] AS name,
333+ SPLIT(repo, '/')[OFFSET(0)] AS owner
334+ FROM
335+ `bigquery-public-data.github_repos.commits` c,
336+ UNNEST(c.repo_name) AS repo
337+ WHERE
338+ TIMESTAMP_SECONDS(c.committer.time_sec)
339+ BETWEEN TIMESTAMP({ created_after_str } )
340+ AND TIMESTAMP({ created_before_str } )
341+ )
345342 SELECT DISTINCT
346- repo AS full_name,
347- SPLIT(repo, '/')[OFFSET(1)] AS name,
348- SPLIT(repo, '/')[OFFSET(0)] AS owner
349- FROM
350- `bigquery-public-data.github_repos.commits` c,
351- UNNEST(c.repo_name) AS repo
352- WHERE
353- TIMESTAMP_SECONDS(c.committer.time_sec) BETWEEN TIMESTAMP({ created_after } ) AND TIMESTAMP({ created_before } )
354- { ("AND SPLIT(repo, '/')[OFFSET(0)] IN (" + (lang_list or "" ) + ")" ) if languages and lang_list else "" }
355- )
356- SELECT
357- full_name,
358- name,
359- owner
360- FROM
361- repo_set
362- ORDER BY RAND({ self ._seed } )
363- LIMIT { n_samples }
364- """
343+ rs.full_name,
344+ rs.name,
345+ rs.owner,
346+ CONCAT('https://github.com/', rs.full_name) AS html_url
347+ FROM repo_set rs
348+ JOIN `bigquery-public-data.github_repos.languages` l
349+ ON rs.full_name = l.repo_name,
350+ UNNEST(l.language) AS lang
351+ WHERE lang.name IN ({ lang_list } )
352+ ORDER BY RAND({ self ._seed } )
353+ LIMIT { n_samples }
354+ """
355+ else :
356+ query = f"""
357+ WITH repo_set AS (
358+ SELECT DISTINCT
359+ repo AS full_name,
360+ SPLIT(repo, '/')[OFFSET(1)] AS name,
361+ SPLIT(repo, '/')[OFFSET(0)] AS owner
362+ FROM
363+ `bigquery-public-data.github_repos.commits` c,
364+ UNNEST(c.repo_name) AS repo
365+ WHERE
366+ TIMESTAMP_SECONDS(c.committer.time_sec)
367+ BETWEEN TIMESTAMP({ created_after_str } )
368+ AND TIMESTAMP({ created_before_str } )
369+ )
370+ SELECT
371+ full_name,
372+ name,
373+ owner,
374+ CONCAT('https://github.com/', full_name) AS html_url
375+ FROM repo_set
376+ ORDER BY RAND({ self ._seed } )
377+ LIMIT { n_samples }
378+ """
379+
365380 valid_repos = self ._execute_query (query )
366- self .results : list [ dict [ str , Any ]] = valid_repos
381+ self .results = valid_repos
367382
368383 filtered_count_before = len (valid_repos )
369384 if kwargs :
370- self .results : list [ dict [ str , Any ]] = filter_repos (valid_repos , ** kwargs )
385+ self .results = filter_repos (valid_repos , ** kwargs )
371386 filtered_count_after = len (self .results )
372387 if filtered_count_before != filtered_count_after :
373388 self .logger .info (
@@ -387,7 +402,7 @@ def sample(
387402 Args:
388403 n_samples: Number of repositories to sample
389404 population: Type of repository population to sample from ('all' or 'active')
390- **kwargs: Any: Additional filtering criteria
405+ **kwargs: Additional filtering criteria
391406
392407 Returns:
393408 List of repository dictionaries
@@ -397,8 +412,8 @@ def sample(
397412 )
398413 start_time = time .time ()
399414
400- self .attempts : int = 0
401- self .success_count : int = 0
415+ self .attempts = 0
416+ self .success_count = 0
402417
403418 if population == "active" :
404419 self .logger .info ("Targeting active repositories with recent commits" )
@@ -452,11 +467,12 @@ def get_languages(
452467 results = self ._execute_query (query )
453468 query_elapsed = time .time () - query_start_time
454469 self .logger .info (
455- f"Query completed in { query_elapsed :.2f} s: found language data for { len (results )} repositories"
470+ f"Query completed in { query_elapsed :.2f} s: "
471+ f"found language data for { len (results )} repositories"
456472 )
457473
458474 # Process results
459- language_info = {}
475+ language_info : dict [ str , list [ dict [ str , Any ]]] = {}
460476 for result in results :
461477 repo_name = result .get ("repo_name" )
462478 if repo_name and "languages" in result :
@@ -467,18 +483,19 @@ def get_languages(
467483 elapsed_time = time .time () - start_time
468484
469485 self .logger .info (
470- f"Language query completed in { elapsed_time :.2f} s: found data for { repos_with_language } /{ len (repos )} repos"
486+ f"Language query completed in { elapsed_time :.2f} s: "
487+ f"found data for { repos_with_language } /{ len (repos )} repos"
471488 )
472489
473490 # Generate language statistics if data was found
474491 if language_info :
475- all_languages = []
492+ all_languages : list [ str ] = []
476493 for repo_langs in language_info .values ():
477494 for lang_entry in repo_langs :
478495 if "language" in lang_entry :
479496 all_languages .append (lang_entry ["language" ])
480497
481- language_counts = {}
498+ language_counts : dict [ str , int ] = {}
482499 for lang in all_languages :
483500 language_counts [lang ] = language_counts .get (lang , 0 ) + 1
484501
0 commit comments