Skip to content

Commit 86586e1

Browse files
committed
v.4
1 parent f2bc7fb commit 86586e1

9 files changed

Lines changed: 353 additions & 440 deletions

File tree

pyproject.toml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -70,8 +70,8 @@ all = [] # Install with: pip install reporoulette[bigquery,docs,dev]
7070
project = "reporoulette"
7171
author = "Gaurav Sood"
7272
copyright = "2025, Gaurav Sood"
73-
version = "0.3.0"
74-
release = "0.3.0"
73+
version = "0.4.0"
74+
release = "0.4.0"
7575
language = "en"
7676
extensions = [
7777
"sphinx.ext.autodoc",
@@ -129,7 +129,7 @@ markers = [
129129

130130
[tool.ruff]
131131
line-length = 88
132-
target-version = "py311"
132+
target-version = "py312"
133133

134134
[tool.ruff.lint]
135135
select = [

reporoulette/__init__.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -75,9 +75,7 @@ def sample(
7575
credentials_path=credentials_path, project_id=project_id
7676
)
7777
else:
78-
error_msg = f"Unknown sampling method: {method}"
79-
logging.error(error_msg)
80-
return {"error": error_msg}
78+
raise ValueError(f"Unknown sampling method: {method}")
8179

8280
# Sample repositories
8381
results = sampler.sample(n_samples=n_samples, **kwargs)

reporoulette/samplers/base.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -108,23 +108,26 @@ def _get_headers(self) -> dict[str, str]:
108108
headers["Authorization"] = f"token {self.token}"
109109
return headers
110110

111-
def _check_rate_limit(self) -> int:
111+
def _check_rate_limit(self, resource: str = "core") -> int:
112112
"""Check GitHub API rate limit and return remaining requests.
113113
114+
Args:
115+
resource: Which rate limit resource to check ("core" or "search")
116+
114117
Returns:
115118
Number of remaining API requests, or 0 if check fails
116119
"""
117120
headers = self._get_headers()
118121

119122
try:
120-
self.logger.debug("Checking GitHub API rate limit")
123+
self.logger.debug(f"Checking GitHub API rate limit for {resource}")
121124
response = requests.get(f"{self.api_base_url}/rate_limit", headers=headers)
122125
if response.status_code == HTTP_OK:
123126
data = response.json()
124-
remaining = data["resources"]["core"]["remaining"]
125-
reset_time = data["resources"]["core"]["reset"]
127+
remaining = data["resources"][resource]["remaining"]
128+
reset_time = data["resources"][resource]["reset"]
126129
self.logger.debug(
127-
f"Rate limit status: {remaining} requests remaining, reset at timestamp {reset_time}"
130+
f"Rate limit status ({resource}): {remaining} requests remaining, reset at timestamp {reset_time}"
128131
)
129132
return remaining
130133
else:

reporoulette/samplers/bigquery_sampler.py

Lines changed: 107 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -128,39 +128,26 @@ def _execute_query(self, query: str) -> list[dict[str, Any]]:
128128
return results
129129

130130
def _build_count_query(self, days_to_sample: int, years_back: int) -> str:
131-
"""Build SQL query that creates temporary table of random days and counts repositories."""
131+
"""Build SQL query to count repositories per random day using wildcard tables."""
132+
cutoff_date = (datetime.now() - timedelta(days=365 * years_back)).strftime(
133+
"%Y%m%d"
134+
)
132135
return f"""
133-
-- Define parameters
134-
DECLARE days_to_sample INT64 DEFAULT {days_to_sample};
135-
DECLARE years_back INT64 DEFAULT {years_back};
136-
137-
-- Create a table of random dates to sample from
138-
CREATE TEMP TABLE random_dates AS (
136+
WITH random_dates AS (
139137
SELECT
140138
FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(),
141-
INTERVAL CAST(FLOOR(RAND() * (365 * years_back)) AS INT64) DAY)) AS day
142-
FROM UNNEST(GENERATE_ARRAY(1, days_to_sample))
143-
);
144-
145-
-- Count unique repositories per day
139+
INTERVAL CAST(FLOOR(RAND() * (365 * {years_back})) AS INT64) DAY)) AS day
140+
FROM UNNEST(GENERATE_ARRAY(1, {days_to_sample}))
141+
)
146142
SELECT
147143
rd.day AS sample_day,
148-
COUNT(DISTINCT event.repo_name) AS repo_count
144+
COUNT(DISTINCT repo.name) AS repo_count
149145
FROM random_dates rd
150-
CROSS JOIN (
151-
SELECT repo.name AS repo_name, created_at
152-
FROM (
153-
EXECUTE IMMEDIATE FORMAT(
154-
"SELECT repo.name, created_at
155-
FROM `githubarchive.day.%s`
156-
WHERE created_at >= TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL %d YEAR)
157-
LIMIT 100000",
158-
rd.day,
159-
years_back
160-
)
161-
)
162-
) event
146+
JOIN `githubarchive.day.*` gh
147+
ON _TABLE_SUFFIX = rd.day
148+
AND _TABLE_SUFFIX >= '{cutoff_date}'
163149
GROUP BY rd.day
150+
HAVING COUNT(DISTINCT repo.name) > 0
164151
ORDER BY repo_count DESC
165152
"""
166153

@@ -173,31 +160,19 @@ def _build_day_query(
173160
samples_to_take = day_data.get("samples_to_take", 1)
174161

175162
return f"""
176-
-- Day {i + 1}: {day} with {repo_count} repositories
177163
SELECT DISTINCT
178-
event.repo_name AS full_name,
179-
SPLIT(event.repo_name, '/')[SAFE_OFFSET(1)] AS name,
180-
SPLIT(event.repo_name, '/')[SAFE_OFFSET(0)] AS owner,
181-
event.repo_url AS html_url,
182-
event.created_at,
164+
repo.name AS full_name,
165+
SPLIT(repo.name, '/')[SAFE_OFFSET(1)] AS name,
166+
SPLIT(repo.name, '/')[SAFE_OFFSET(0)] AS owner,
167+
CONCAT('https://github.com/', repo.name) AS html_url,
168+
created_at,
183169
'{day}' AS sampled_from,
184-
event.event_type,
170+
type AS event_type,
185171
{repo_count} AS day_repo_count,
186172
{samples_to_take} AS samples_allocated
187-
FROM (
188-
EXECUTE IMMEDIATE FORMAT(
189-
"SELECT
190-
repo.name AS repo_name,
191-
repo.url AS repo_url,
192-
created_at,
193-
type AS event_type
194-
FROM `githubarchive.day.%s`
195-
WHERE created_at >= TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL %d YEAR)
196-
LIMIT 100000",
197-
'{day}',
198-
{years_back}
199-
)
200-
) AS event
173+
FROM `githubarchive.day.{day}`
174+
WHERE repo.name IS NOT NULL
175+
AND repo.name LIKE '%/%'
201176
ORDER BY RAND({self._seed} + {i})
202177
LIMIT {samples_to_take}
203178
"""
@@ -206,7 +181,6 @@ def _combine_day_queries(self, day_queries: list[str], n_samples: int) -> str:
206181
"""Combine day queries into final query and deduplicate results."""
207182
combined_query = "\nUNION ALL\n".join(day_queries)
208183
return f"""
209-
-- Final combined query with deduplication
210184
SELECT DISTINCT
211185
full_name,
212186
name,
@@ -275,7 +249,7 @@ def sample_by_day(
275249

276250
filtered_count_before = len(valid_repos)
277251
if kwargs:
278-
self.results: list[dict[str, Any]] = filter_repos(valid_repos, **kwargs)
252+
self.results = filter_repos(valid_repos, **kwargs)
279253
filtered_count_after = len(self.results)
280254
if filtered_count_before != filtered_count_after:
281255
self.logger.info(
@@ -287,7 +261,7 @@ def sample_by_day(
287261
)
288262

289263
if valid_repos:
290-
day_counts_map = {}
264+
day_counts_map: dict[str, dict[str, int]] = {}
291265
for repo in valid_repos:
292266
day_sampled = repo.get("sampled_from", "unknown")
293267
day_repo_count = repo.get("day_repo_count", 0)
@@ -302,7 +276,8 @@ def sample_by_day(
302276
self.logger.info(f"Sampled from {len(day_counts_map)} different days")
303277
for day_str, data in sorted(day_counts_map.items()):
304278
self.logger.debug(
305-
f"Day {day_str}: {data['count']}/{data['allocated']} samples from {data['repos']} repos"
279+
f"Day {day_str}: {data['count']}/{data['allocated']} samples "
280+
f"from {data['repos']} repos"
306281
)
307282

308283
return self.results
@@ -315,59 +290,99 @@ def sample_active(
315290
languages: list[str] | None = None,
316291
**kwargs: Any,
317292
) -> list[dict[str, Any]]:
318-
"""Sample repositories with recent commit activity."""
293+
"""Sample repositories with recent commit activity.
294+
295+
Args:
296+
n_samples: Number of repositories to sample
297+
created_after: Filter commits after this timestamp
298+
created_before: Filter commits before this timestamp
299+
languages: List of programming languages to filter by (uses github_repos.languages)
300+
**kwargs: Additional filter criteria
301+
302+
Returns:
303+
List of repository dictionaries
304+
"""
319305
self.logger.info(
320306
f"Sampling {n_samples} active repositories based on commit history"
321307
)
322308
if kwargs:
323309
self.logger.info(f"Filter criteria: {kwargs}")
324310

325311
if created_after:
326-
created_after = format_timestamp_query(created_after)
312+
created_after_str = format_timestamp_query(created_after)
327313
else:
328314
one_year_ago = (datetime.now() - timedelta(days=365)).strftime("%Y-%m-%d")
329-
created_after = f"'{one_year_ago}'"
315+
created_after_str = f"'{one_year_ago}'"
330316

331317
if created_before:
332-
created_before = format_timestamp_query(created_before)
318+
created_before_str = format_timestamp_query(created_before)
333319
else:
334-
created_before = "CURRENT_TIMESTAMP()"
320+
created_before_str = "CURRENT_TIMESTAMP()"
335321

336-
self.logger.info(f"Time period: {created_after} to {created_before}")
322+
self.logger.info(f"Time period: {created_after_str} to {created_before_str}")
337323

338-
lang_list = None
339324
if languages:
340325
lang_list = ", ".join([f"'{lang}'" for lang in languages])
341326
self.logger.info(f"Filtering for languages: {lang_list}")
342-
343-
query = f"""
344-
WITH repo_set AS (
327+
# Use JOIN with languages table for proper language filtering
328+
query = f"""
329+
WITH repo_set AS (
330+
SELECT DISTINCT
331+
repo AS full_name,
332+
SPLIT(repo, '/')[OFFSET(1)] AS name,
333+
SPLIT(repo, '/')[OFFSET(0)] AS owner
334+
FROM
335+
`bigquery-public-data.github_repos.commits` c,
336+
UNNEST(c.repo_name) AS repo
337+
WHERE
338+
TIMESTAMP_SECONDS(c.committer.time_sec)
339+
BETWEEN TIMESTAMP({created_after_str})
340+
AND TIMESTAMP({created_before_str})
341+
)
345342
SELECT DISTINCT
346-
repo AS full_name,
347-
SPLIT(repo, '/')[OFFSET(1)] AS name,
348-
SPLIT(repo, '/')[OFFSET(0)] AS owner
349-
FROM
350-
`bigquery-public-data.github_repos.commits` c,
351-
UNNEST(c.repo_name) AS repo
352-
WHERE
353-
TIMESTAMP_SECONDS(c.committer.time_sec) BETWEEN TIMESTAMP({created_after}) AND TIMESTAMP({created_before})
354-
{("AND SPLIT(repo, '/')[OFFSET(0)] IN (" + (lang_list or "") + ")") if languages and lang_list else ""}
355-
)
356-
SELECT
357-
full_name,
358-
name,
359-
owner
360-
FROM
361-
repo_set
362-
ORDER BY RAND({self._seed})
363-
LIMIT {n_samples}
364-
"""
343+
rs.full_name,
344+
rs.name,
345+
rs.owner,
346+
CONCAT('https://github.com/', rs.full_name) AS html_url
347+
FROM repo_set rs
348+
JOIN `bigquery-public-data.github_repos.languages` l
349+
ON rs.full_name = l.repo_name,
350+
UNNEST(l.language) AS lang
351+
WHERE lang.name IN ({lang_list})
352+
ORDER BY RAND({self._seed})
353+
LIMIT {n_samples}
354+
"""
355+
else:
356+
query = f"""
357+
WITH repo_set AS (
358+
SELECT DISTINCT
359+
repo AS full_name,
360+
SPLIT(repo, '/')[OFFSET(1)] AS name,
361+
SPLIT(repo, '/')[OFFSET(0)] AS owner
362+
FROM
363+
`bigquery-public-data.github_repos.commits` c,
364+
UNNEST(c.repo_name) AS repo
365+
WHERE
366+
TIMESTAMP_SECONDS(c.committer.time_sec)
367+
BETWEEN TIMESTAMP({created_after_str})
368+
AND TIMESTAMP({created_before_str})
369+
)
370+
SELECT
371+
full_name,
372+
name,
373+
owner,
374+
CONCAT('https://github.com/', full_name) AS html_url
375+
FROM repo_set
376+
ORDER BY RAND({self._seed})
377+
LIMIT {n_samples}
378+
"""
379+
365380
valid_repos = self._execute_query(query)
366-
self.results: list[dict[str, Any]] = valid_repos
381+
self.results = valid_repos
367382

368383
filtered_count_before = len(valid_repos)
369384
if kwargs:
370-
self.results: list[dict[str, Any]] = filter_repos(valid_repos, **kwargs)
385+
self.results = filter_repos(valid_repos, **kwargs)
371386
filtered_count_after = len(self.results)
372387
if filtered_count_before != filtered_count_after:
373388
self.logger.info(
@@ -387,7 +402,7 @@ def sample(
387402
Args:
388403
n_samples: Number of repositories to sample
389404
population: Type of repository population to sample from ('all' or 'active')
390-
**kwargs: Any: Additional filtering criteria
405+
**kwargs: Additional filtering criteria
391406
392407
Returns:
393408
List of repository dictionaries
@@ -397,8 +412,8 @@ def sample(
397412
)
398413
start_time = time.time()
399414

400-
self.attempts: int = 0
401-
self.success_count: int = 0
415+
self.attempts = 0
416+
self.success_count = 0
402417

403418
if population == "active":
404419
self.logger.info("Targeting active repositories with recent commits")
@@ -452,11 +467,12 @@ def get_languages(
452467
results = self._execute_query(query)
453468
query_elapsed = time.time() - query_start_time
454469
self.logger.info(
455-
f"Query completed in {query_elapsed:.2f}s: found language data for {len(results)} repositories"
470+
f"Query completed in {query_elapsed:.2f}s: "
471+
f"found language data for {len(results)} repositories"
456472
)
457473

458474
# Process results
459-
language_info = {}
475+
language_info: dict[str, list[dict[str, Any]]] = {}
460476
for result in results:
461477
repo_name = result.get("repo_name")
462478
if repo_name and "languages" in result:
@@ -467,18 +483,19 @@ def get_languages(
467483
elapsed_time = time.time() - start_time
468484

469485
self.logger.info(
470-
f"Language query completed in {elapsed_time:.2f}s: found data for {repos_with_language}/{len(repos)} repos"
486+
f"Language query completed in {elapsed_time:.2f}s: "
487+
f"found data for {repos_with_language}/{len(repos)} repos"
471488
)
472489

473490
# Generate language statistics if data was found
474491
if language_info:
475-
all_languages = []
492+
all_languages: list[str] = []
476493
for repo_langs in language_info.values():
477494
for lang_entry in repo_langs:
478495
if "language" in lang_entry:
479496
all_languages.append(lang_entry["language"])
480497

481-
language_counts = {}
498+
language_counts: dict[str, int] = {}
482499
for lang in all_languages:
483500
language_counts[lang] = language_counts.get(lang, 0) + 1
484501

reporoulette/samplers/gh_sampler.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -209,8 +209,7 @@ def gh_sampler(
209209
"full_name": repo_name,
210210
"name": name,
211211
"owner": owner,
212-
"html_url": repo.get("url")
213-
or f"https://github.com/{repo_name}",
212+
"html_url": f"https://github.com/{repo_name}",
214213
"created_at": event.get("created_at"),
215214
"sampled_from": day_str,
216215
"event_type": event.get("type"),

0 commit comments

Comments
 (0)