Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from collectoss.tasks.github.util.github_data_access import GithubDataAccess, UrlNotFoundException
from collectoss.application.db.models import ContributorRepo
from collectoss.application.db.lib import bulk_insert_dicts
from collectoss.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth

### This worker scans all the platform users in CollectOSS, and pulls their platform activity
### logs. Those are then used to analyze what repos each is working in (which will include repos not
Expand All @@ -26,8 +25,6 @@ def contributor_breadth_model(self) -> None:
tool_version = '0.0.1'
data_source = 'GitHub API'

key_auth = GithubRandomKeyAuth(logger)

# This version of the query pulls contributors who have not had any data collected yet
# to the top of the list
cntrb_login_query = s.sql.text("""
Expand Down Expand Up @@ -83,7 +80,7 @@ def contributor_breadth_model(self) -> None:

cntrb_newest_events_map[gh_login] = newest_event_date

github_data_access = GithubDataAccess(key_auth, logger)
github_data_access = GithubDataAccess(None, logger)

index = 1
total = len(current_cntrb_logins)
Expand Down
4 changes: 2 additions & 2 deletions collectoss/tasks/frontend.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ def get_org_repo_data(orgs, session):
def add_new_github_repos(repo_data, group_id, session, logger):

# get data for repos to determine type, src id, and if they exist
data = get_github_repos_data(repo_data, session, logger)
data = get_github_repos_data(repo_data, None, logger)

for url, repo_group_id in repo_data:

Expand Down Expand Up @@ -200,7 +200,7 @@ def get_github_repos_data(repo_data, session, logger):

repo_urls = [x[0] for x in repo_data]

github_graphql_data_access = GithubGraphQlDataAccess(session.oauths, logger, ingore_not_found_error=True)
github_graphql_data_access = GithubGraphQlDataAccess(None, logger, ingore_not_found_error=True)

query_parts = []
repo_map = {}
Expand Down
4 changes: 2 additions & 2 deletions collectoss/tasks/github/contributors.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ def process_contributors():



@deprecated("This function is deprecated. Use the GithubDataAccess class instead")
def retrieve_dict_data(url: str, key_auth, logger):

num_attempts = 0
Expand Down Expand Up @@ -133,8 +134,7 @@ def grab_comitters(self, repo_git,platform="github"):
logger = logging.getLogger(grab_comitters.__name__)

try:
key_auth = GithubRandomKeyAuth(logger)
grab_committer_list(logger, key_auth, repo_git, tool_source, tool_version, data_source, platform)
grab_committer_list(logger, None, repo_git, tool_source, tool_version, data_source, platform)

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[pylint] reported by reviewdog 🐶
E0602: Undefined variable 'grab_committer_list' (undefined-variable)

except Exception as e:
logger.error(f"Could not grab committers from github endpoint!\n Reason: {e} \n Traceback: {''.join(traceback.format_exception(None, e, e.__traceback__))}")

10 changes: 5 additions & 5 deletions collectoss/tasks/github/events.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def collect_events(repo_git: str, full_collection: bool):

key_auth = GithubRandomKeyAuth(logger)

if bulk_events_collection_endpoint_contains_all_data(key_auth, logger, owner, repo):
if bulk_events_collection_endpoint_contains_all_data(None, logger, owner, repo):
collection_strategy = BulkGithubEventCollection(logger)
else:
collection_strategy = ThoroughGithubEventCollection(logger)
Expand All @@ -50,7 +50,7 @@ def bulk_events_collection_endpoint_contains_all_data(key_auth, logger, owner, r

url = f"https://api.github.com/repos/{owner}/{repo}/issues/events?per_page=100"

github_data_access = GithubDataAccess(key_auth, logger)
github_data_access = GithubDataAccess(None, logger)

page_count = github_data_access.get_resource_page_count(url)

Expand Down Expand Up @@ -136,7 +136,7 @@ def _collect_events(self, repo_git: str, key_auth, since):

url = f"https://api.github.com/repos/{owner}/{repo}/issues/events"

github_data_access = GithubDataAccess(key_auth, self._logger)
github_data_access = GithubDataAccess(None, self._logger)

for event in github_data_access.paginate_resource(url):

Expand Down Expand Up @@ -308,7 +308,7 @@ def _collect_and_process_issue_events(self, owner, repo, repo_id, key_auth, sinc

events = []
contributors = []
github_data_access = GithubDataAccess(key_auth, self._logger)
github_data_access = GithubDataAccess(None, self._logger)
for db_issue in issue_result:
issue = db_issue._asdict()

Expand Down Expand Up @@ -371,7 +371,7 @@ def _collect_and_process_pr_events(self, owner, repo, repo_id, key_auth, since):

events = []
contributors = []
github_data_access = GithubDataAccess(key_auth, self._logger)
github_data_access = GithubDataAccess(None, self._logger)
for db_pr in pr_result:
pr = db_pr._asdict()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,7 @@ def fetch_username_from_email(logger, auth, commit) -> dict | None:
return login_json

try:
github_data_access = GithubDataAccess(auth, logger, feature="search")
github_data_access = GithubDataAccess(None, logger, feature="search")
login_json = github_data_access.get_resource(url)
except Exception as e:
logger.error(f"Couldn't resolve email URL with given data. Reason: {e}")
Expand Down Expand Up @@ -328,7 +328,7 @@ def get_login_with_supplemental_data(logger, auth, commit_data):

# Try to get login from all possible emails
# Is None upon failure.
login_json = fetch_username_from_email(logger,auth,commit_data)
login_json = fetch_username_from_email(logger,None,commit_data)

# total_count is the number of usernames found by the endpoint.
# This checks whether the email lookup returned anything.
Expand Down Expand Up @@ -367,7 +367,7 @@ def get_login_with_commit_hash(logger, auth, commit_data, repo_id):

#TODO: here.
# Send api request
github_data_access = GithubDataAccess(auth, logger)
github_data_access = GithubDataAccess(None, logger)
login_json = github_data_access.get_resource(url)

# TODO: Why are we returning None if 'sha' is not in response if we aren't even using it?
Expand Down
4 changes: 2 additions & 2 deletions collectoss/tasks/github/facade_github/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def query_github_contributors(logger, key_auth, github_url, tool_source:str, too
update_col_map = {'cntrb_email': 'email'}
duplicate_col_map = {'cntrb_login': 'login'}

github_data_access = GithubDataAccess(key_auth, logger)
github_data_access = GithubDataAccess(None, logger)

contributor_count = github_data_access.get_resource_count(contributors_url)

Expand Down Expand Up @@ -79,5 +79,5 @@ def query_github_contributors(logger, key_auth, github_url, tool_source:str, too
def grab_committer_list(logger, key_auth, repo_git, tool_source: str, tool_version: str, data_source: str, platform="github" ):

# Create API endpoint from repo_id
query_github_contributors(logger, key_auth, repo_git, tool_source, tool_version, data_source)
query_github_contributors(logger, None, repo_git, tool_source, tool_version, data_source)

12 changes: 5 additions & 7 deletions collectoss/tasks/github/facade_github/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

def process_commit_metadata(logger, auth, contributorQueue, repo_id, platform_id, tool_source:str, tool_version:str, data_source:str):

github_data_access = GithubDataAccess(auth, logger)
github_data_access = GithubDataAccess(None, logger)

for contributor in contributorQueue:
# Get the email from the commit data
Expand Down Expand Up @@ -63,12 +63,12 @@ def process_commit_metadata(logger, auth, contributorQueue, repo_id, platform_id

# Try to get the login from the commit sha
if login == None or login == "":
login = get_login_with_commit_hash(logger, auth, contributor, repo_id)
login = get_login_with_commit_hash(logger, None, contributor, repo_id)

if login == None or login == "":
logger.warning("Failed to get login from commit hash")
# Try to get the login from supplemental data if not found with the commit hash
login = get_login_with_supplemental_data(logger, auth,contributor)
login = get_login_with_supplemental_data(logger, None,contributor)

if login == None or login == "":
logger.error("Failed to get login from supplemental data!")
Expand Down Expand Up @@ -229,8 +229,6 @@ def insert_facade_contributors(self, repo_git):
# 'repo_id': repo_id}).to_json(orient="records"))


key_auth = GithubRandomKeyAuth(logger)

facade_batch_size = get_batch_size()

# Process results in batches to reduce memory usage
Expand All @@ -240,12 +238,12 @@ def insert_facade_contributors(self, repo_git):
batch.append(dict(row))

if len(batch) >= facade_batch_size:
process_commit_metadata(logger, key_auth, batch, repo_id, platform_id, tool_source, tool_version, data_source)
process_commit_metadata(logger, None, batch, repo_id, platform_id, tool_source, tool_version, data_source)
batch.clear()

# Process remaining items in batch
if batch:
process_commit_metadata(logger, key_auth, batch, repo_id, platform_id, tool_source, tool_version, data_source)
process_commit_metadata(logger, None, batch, repo_id, platform_id, tool_source, tool_version, data_source)

logger.debug("DEBUG: Got through the new_contribs")

Expand Down
13 changes: 4 additions & 9 deletions collectoss/tasks/github/issues.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from collectoss.tasks.init.celery_app import CoreRepoCollectionTask
from collectoss.application.db.data_parse import *
from collectoss.tasks.github.util.github_data_access import GithubDataAccess
from collectoss.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth
from collectoss.tasks.github.util.util import add_key_value_pair_to_dicts, get_owner_repo
from collectoss.tasks.util.worker_util import remove_duplicate_dicts
from collectoss.application.db.models import Issue, IssueLabel, IssueAssignee
Expand Down Expand Up @@ -47,12 +46,8 @@ def collect_issues(repo_git: str, full_collection: bool) -> int:
# Subtract 2 days to ensure all data is collected
core_data_last_collected = (get_core_data_last_collected(repo_id) - timedelta(days=2)).replace(tzinfo=timezone.utc)

key_auth = GithubRandomKeyAuth(logger)

logger.info(f'this is the manifest.key_auth value: {str(key_auth)}')

try:
issue_data_generator = retrieve_all_issue_data(repo_git, logger, key_auth, core_data_last_collected)
issue_data_generator = retrieve_all_issue_data(repo_git, logger, None, core_data_last_collected)

issue_batch_size = get_batch_size()

Expand Down Expand Up @@ -86,7 +81,7 @@ def collect_issues(repo_git: str, full_collection: bool) -> int:



def retrieve_all_issue_data(repo_git: str, logger: logging.Logger, key_auth: GithubRandomKeyAuth, since: datetime | None = None):
def retrieve_all_issue_data(repo_git: str, logger: logging.Logger, key_auth: None, since: datetime | None = None):
"""
Retrieve all issue data for a repository as a generator.

Expand All @@ -96,7 +91,7 @@ def retrieve_all_issue_data(repo_git: str, logger: logging.Logger, key_auth: Git
Args:
repo_git (str): The GitHub repository in "owner/repo" format.
logger (logging.Logger): Logger for logging messages.
key_auth (GithubRandomKeyAuth): Auth handler for GitHub API.
key_auth (GithubRandomKeyAuth): Auth handler for GitHub API. Unused and deprecated; use KeyClient instead.
since (datetime, optional): Only issues updated since this datetime will be retrieved.
"""
owner, repo = get_owner_repo(repo_git)
Expand All @@ -108,7 +103,7 @@ def retrieve_all_issue_data(repo_git: str, logger: logging.Logger, key_auth: Git
if since:
url += f"&since={since.isoformat()}"

github_data_access = GithubDataAccess(key_auth, logger)
github_data_access = GithubDataAccess(None, logger)

num_pages = github_data_access.get_resource_page_count(url)
logger.info(f"{owner}/{repo}: Retrieving {num_pages} pages of issues")
Expand Down
8 changes: 4 additions & 4 deletions collectoss/tasks/github/messages.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def collect_github_messages(repo_git: str, full_collection: bool) -> None:


if is_repo_small(repo_id):
message_data = fast_retrieve_all_pr_and_issue_messages(repo_git, logger, manifest.key_auth, task_name, core_data_last_collected)
message_data = fast_retrieve_all_pr_and_issue_messages(repo_git, logger, None, task_name, core_data_last_collected)

if message_data:
process_messages(message_data, task_name, repo_id, logger, db_session)
Expand All @@ -49,7 +49,7 @@ def collect_github_messages(repo_git: str, full_collection: bool) -> None:
logger.info(f"{owner}/{repo} has no messages")

else:
process_large_issue_and_pr_message_collection(repo_id, repo_git, logger, manifest.key_auth, task_name, db_session, core_data_last_collected)
process_large_issue_and_pr_message_collection(repo_id, repo_git, logger, None, task_name, db_session, core_data_last_collected)


def is_repo_small(repo_id):
Expand All @@ -73,7 +73,7 @@ def fast_retrieve_all_pr_and_issue_messages(repo_git: str, logger, key_auth, tas
# define logger for task
logger.info(f"Collecting github comments for {owner}/{repo}")

github_data_access = GithubDataAccess(key_auth, logger)
github_data_access = GithubDataAccess(None, logger)

message_count = github_data_access.get_resource_count(url)

Expand Down Expand Up @@ -113,7 +113,7 @@ def process_large_issue_and_pr_message_collection(repo_id, repo_git: str, logger
result = connection.execute(query).fetchall()
comment_urls = [x[0] for x in result if x[0] is not None]

github_data_access = GithubDataAccess(key_auth, logger)
github_data_access = GithubDataAccess(None, logger)

logger.info(f"{task_name}: Collecting github messages for {len(comment_urls)} prs/issues")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def pull_request_commits_model(repo_id,logger, db_session, key_auth, full_collec

logger.info(f"Getting pull request commits for repo: {repo.repo_git}")

github_data_access = GithubDataAccess(key_auth, logger)
github_data_access = GithubDataAccess(None, logger)

pr_commits_natural_keys = ["pull_request_id", "repo_id", "pr_cmt_sha"]
all_data = []
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,4 @@ def process_pull_request_commits(repo_git: str, full_collection: bool) -> None:

with GithubTaskManifest(logger) as manifest:

pull_request_commits_model(repo.repo_id, logger, manifest.db_session, manifest.key_auth, full_collection)
pull_request_commits_model(repo.repo_id, logger, manifest.db_session, None, full_collection)
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def pull_request_files_model(repo_id,logger, db_session, key_auth, full_collecti

task_name = f"{owner}/{name} Pr files"

github_graphql_data_access = GithubGraphQlDataAccess(key_auth, logger)
github_graphql_data_access = GithubGraphQlDataAccess(None, logger)

pr_file_natural_keys = ["pull_request_id", "repo_id", "pr_file_path"]
pr_file_rows = []
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,4 @@ def process_pull_request_files(repo_git: str, full_collection: bool) -> None:
query = db_session.session.query(Repo).filter(Repo.repo_git == repo_git)
repo = execute_session_query(query, 'one')

pull_request_files_model(repo.repo_id, logger, db_session, manifest.key_auth, full_collection)
pull_request_files_model(repo.repo_id, logger, db_session, None, full_collection)
10 changes: 4 additions & 6 deletions collectoss/tasks/github/pull_requests/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
from collectoss.tasks.github.util.util import add_key_value_pair_to_dicts, get_owner_repo
from collectoss.application.db.models import PullRequest, Message, PullRequestReview, PullRequestLabel, PullRequestReviewer, PullRequestMeta, PullRequestAssignee, PullRequestReviewMessageRef, Contributor, Repo
from collectoss.tasks.github.util.github_task_session import GithubTaskManifest
from collectoss.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth
from collectoss.application.db.lib import get_repo_by_repo_git, bulk_insert_dicts, get_pull_request_reviews_by_repo_id, batch_insert_contributors, get_batch_size
from collectoss.application.db.util import execute_session_query
from ..messages import process_github_comment_contributors
Expand Down Expand Up @@ -46,7 +45,7 @@ def collect_pull_requests(repo_git: str, full_collection: bool) -> int:

total_count = 0
all_data = []
for pr in retrieve_all_pr_data(repo_git, logger, manifest.key_auth, core_data_last_collected):
for pr in retrieve_all_pr_data(repo_git, logger, None, core_data_last_collected):

all_data.append(pr)

Expand Down Expand Up @@ -75,7 +74,7 @@ def retrieve_all_pr_data(repo_git: str, logger, key_auth, since): #-> Generator[

logger.debug(f"Collecting pull requests for {owner}/{repo}")

github_data_access = GithubDataAccess(key_auth, logger)
github_data_access = GithubDataAccess(None, logger)

search_args = {"state": "all", "direction": "desc", "sort": "updated"}
url = github_data_access.endpoint_url(f"repos/{owner}/{repo}/pulls", search_args)
Expand Down Expand Up @@ -257,8 +256,7 @@ def collect_pull_request_review_comments(repo_git: str, full_collection: bool) -
tool_version = "2.0"
data_source = "Github API"

key_auth = GithubRandomKeyAuth(logger)
github_data_access = GithubDataAccess(key_auth, logger)
github_data_access = GithubDataAccess(None, logger)

pr_review_comment_batch_size = get_batch_size()

Expand Down Expand Up @@ -495,7 +493,7 @@ def collect_pull_request_reviews(repo_git: str, full_collection: bool) -> None:

logger.info(f"{owner}/{repo}: Collecting reviews for {pr_count} PRs")

github_data_access = GithubDataAccess(manifest.key_auth, logger)
github_data_access = GithubDataAccess(None, logger)

pr_review_batch_size = get_batch_size()

Expand Down
4 changes: 2 additions & 2 deletions collectoss/tasks/github/releases/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from collectoss.tasks.github.util.gh_graphql_entities import request_graphql_dict
from collectoss.application.db.util import execute_session_query
from collectoss.application.db.lib import bulk_insert_dicts

from typing_extensions import deprecated

def get_release_inf(repo_id, release, tag_only):
if not tag_only:
Expand Down Expand Up @@ -153,7 +153,7 @@ def get_query(logger, owner, repo, tag_only):
return query



@deprecated("This function is deprecated. Use the GithubGraphQlDataAccess class instead")
def fetch_data(key_auth, logger, github_url, repo_id, tag_only = False):

logger.info("Beginning filling the releases model for repo: " + github_url + "\n")
Expand Down
Loading
Loading