Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
1d7c3e6
initial stub of SystemPaths
MoralCode Jun 23, 2026
306ba82
helpers for processing paths
MoralCode Jun 23, 2026
32c7969
docs and setup
MoralCode Jun 23, 2026
69fc550
add create args to all path functions
MoralCode Jun 23, 2026
8ff9642
create general function for assembling paths so logic is consistent b…
MoralCode Jun 23, 2026
cae9a1f
populate facade repo directory
MoralCode Jun 23, 2026
3e5d0d5
populate config directory
MoralCode Jun 23, 2026
1f81748
populate logs directory
MoralCode Jun 23, 2026
bc47335
populate cache directory
MoralCode Jun 23, 2026
2bc4b04
add function to print all paths
MoralCode Jun 23, 2026
c95a740
make SystemPaths static
MoralCode Jun 23, 2026
80fc2fd
simplify inclusion of home directory path
MoralCode Jun 23, 2026
1abd230
unit testing and fixes for core path builder logic
MoralCode Jun 23, 2026
e20ed8b
print system paths on startup
MoralCode Jun 23, 2026
0bf3c9b
don't write facade and log dirs to the database if they aren't provid…
MoralCode Jun 24, 2026
84980bd
type tweaks for accuracy/documentation
MoralCode Jun 24, 2026
d0e889e
use logging for paths class
MoralCode Jun 24, 2026
75830bb
read from database for facade and logs directories
MoralCode Jun 24, 2026
ac9c878
Replace uses of ROOT_PROJECT_REPO_DIRECTORY used for logging
MoralCode Jun 24, 2026
4cc2178
add models and discourse analysis paths to the SystemPaths object
MoralCode Jun 24, 2026
e942c82
add install path to SystemPaths
MoralCode Jun 24, 2026
1a7b8af
allow the analysis directories to be somewhat controllable with an en…
MoralCode Jun 24, 2026
ef3de93
replace all remaining uses of ROOT_PROJECT_REPO_DIRECTORY with a call…
MoralCode Jun 24, 2026
ced6645
use SystemPaths for remaining log and facade directory-building too
MoralCode Jun 24, 2026
aa7699b
swap path in facade helper
MoralCode Jun 24, 2026
dd0557b
deprecate get_absolute_repo_path in favor of SystemPaths.facade_repo_…
MoralCode Jun 24, 2026
8c5675b
fix syntax for path printing on startup
MoralCode Jun 24, 2026
24be1e9
remove unused imports suggested by reviewdog
MoralCode Jun 24, 2026
be9e1ec
ensure the path is getting resolved the same way in startup.py
MoralCode Jun 24, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 2 additions & 9 deletions collectoss/api/gunicorn_conf.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,16 @@
# from collectoss import ROOT_PROJECT_REPO_DIRECTORY
import multiprocessing
import logging
import os
from pathlib import Path
from glob import glob

from collectoss.application.db.lib import get_value
from collectoss.application.db import dispose_database_engine
from collectoss.application.environment import SystemEnv
from collectoss.application.paths import SystemPaths

logger = logging.getLogger(__name__)


# ROOT_PROJECT_REPO_DIRECTORY = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))

# base_log_dir = ROOT_PROJECT_REPO_DIRECTORY + "/logs/"

# Path(base_log_dir).mkdir(exist_ok=True)

workers = multiprocessing.cpu_count() * 2 + 1
umask = 0o007
reload = True
Expand All @@ -39,7 +32,7 @@
del is_dev

# set the log location for gunicorn
logs_directory = get_value('Logging', 'logs_directory')
logs_directory = SystemPaths.get_logs_directory()

# this syntax satisfies the type checker
is_docker = SystemEnv.get_bool("AUGUR_DOCKER_DEPLOY", False)
Expand Down
3 changes: 2 additions & 1 deletion collectoss/application/cli/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import uuid
import traceback
import requests
from collectoss.application.paths import SystemPaths
from redis.exceptions import ConnectionError as RedisConnectionError
Comment thread
MoralCode marked this conversation as resolved.

from collectoss.application.environment import SystemEnv
Expand Down Expand Up @@ -101,7 +102,7 @@ def start(ctx, disable_collection, development, pidfile, port):
cleanup_collection_status_and_rabbit(logger, ctx.obj.engine)

# Retrieve the log directory via SystemPaths (environment variable, then database config, then the OS default log location)
log_dir = get_value("Logging", "logs_directory") or "."
log_dir = SystemPaths.get_logs_directory()
gunicorn_log_file = os.path.join(log_dir, "gunicorn.log")

gunicorn_command = f"gunicorn -c {gunicorn_location} -b {host}:{port} collectoss.api.server:app --log-file {gunicorn_log_file}"
Expand Down
16 changes: 7 additions & 9 deletions collectoss/application/logs.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,14 @@
import os
from pathlib import Path
import shutil
from collectoss.application.paths import SystemPaths
import coloredlogs
from sqlalchemy.orm import Session

from collectoss.application.db.models import Config
from collectoss.application.config import convert_type_of_value
from collectoss.application.db.util import execute_session_query

ROOT_PROJECT_REPO_DIRECTORY = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))


SIMPLE_FORMAT_STRING = "[%(process)d] %(name)s [%(levelname)s] %(message)s"
VERBOSE_FORMAT_STRING = "%(asctime)s,%(msecs)dms [PID: %(process)d] %(name)s [%(levelname)s] %(message)s"
CLI_FORMAT_STRING = "CLI: [%(module)s.%(funcName)s] [%(levelname)s] %(message)s"
Expand Down Expand Up @@ -117,12 +115,12 @@ def get_log_config():

#TODO dynamically define loggers for every task names.
class TaskLogConfig():
def __init__(self, all_tasks, disable_log_files=False,reset_logfiles=False,base_log_dir=ROOT_PROJECT_REPO_DIRECTORY + "/logs/"):
def __init__(self, all_tasks, disable_log_files=False, reset_logfiles=False, base_log_dir=None):

log_config = get_log_config()

if log_config["logs_directory"] != "":
base_log_dir=log_config["logs_directory"]
if not base_log_dir:
base_log_dir = SystemPaths.get_logs_directory()

if reset_logfiles is True:
try:
Expand Down Expand Up @@ -188,12 +186,12 @@ def getLoggerNames(self):


class SystemLogger():
def __init__(self, logger_name, disable_log_files=False,reset_logfiles=False,base_log_dir=ROOT_PROJECT_REPO_DIRECTORY + "/logs/"):
def __init__(self, logger_name, disable_log_files=False, reset_logfiles=False, base_log_dir=None):

log_config = get_log_config()

if log_config.get("logs_directory", "") != "":
base_log_dir=log_config.get("logs_directory")
if not base_log_dir:
base_log_dir = SystemPaths.get_logs_directory()

if reset_logfiles is True:
try:
Expand Down
181 changes: 181 additions & 0 deletions collectoss/application/paths.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
from platformdirs import PlatformDirs
from collectoss.application.environment import SystemEnv
from pathlib import Path

import logging

logger = logging.getLogger(__name__)

def _clean_path(path: Path | str) -> Path | None:
if path is None:
return None
if isinstance(path, str):
path = Path(path)
return path.expanduser().resolve()

def _verify_path(path: Path, create = True) -> Path | None:
"""Verify the path is a valid directory"""
if create:
if not path.exists():
path.mkdir(parents=True)
if not path.is_dir():
raise ValueError(f"Path {path} is not a valid directory")
return _clean_path(path)


def _path_from_env(env_value: str) -> Path | None:
"""Get the path from the environment variable"""
if env_value is None:
return None
if env_value == "":
return None
return Path(env_value)

def _build_path(env_path:str, default_path:Path) -> Path:
"""Build a path from the environment variable or the default path.

If the environment variable is an absolute path, return it.
If the environment variable is a relative path, resolve it against the home directory.
If the environment variable is not set, return the default path.
"""
if env_path is not None:
env_path = Path(env_path)
if env_path.is_absolute():
return _clean_path(env_path)
else:
return _clean_path(Path.home() / env_path)
else:
return default_path

class SystemPaths:
    """Enable consistent storage and retrieval of filesystem paths needed by the system.

    Configurable paths are chosen using the following hierarchy:
    - Absolute path specified by an environment variable
    - Relative path specified by an environment variable, resolved against the home directory
    - Default path for the operating system based on accepted standards

    The facade and logs directories also consult a value stored in the
    database config when their environment variable is not set.
    """
    app_name = "CollectOSS"
    app_org = "CHAOSS"

    @staticmethod
    def os_defaults(create: bool = True) -> PlatformDirs:
        """Get the set of conventional directories for the operating system.

        Args:
            create: Passed to ``PlatformDirs`` as ``ensure_exists``.
        """
        return PlatformDirs(SystemPaths.app_name, SystemPaths.app_org, ensure_exists=create)

    @staticmethod
    def _database_value(section: str, setting: str):
        """Read a single setting from the database-backed system config."""
        # NOTE(review): these imports are function-local in the original code,
        # presumably to avoid an import cycle with the config/db modules — confirm.
        from collectoss.application.config import SystemConfig
        from collectoss.application.db.session import DatabaseSession
        from collectoss.application.db import get_engine
        with DatabaseSession(logger, get_engine()) as session:
            return SystemConfig(logger, session).get_value(section, setting)

    @staticmethod
    def _analysis_env_path(*parts: str) -> Path | None:
        """Return ``COLLECTOSS_ANALYSIS_DIRECTORY`` joined with ``parts``, or None if unset."""
        env_path = _path_from_env(SystemEnv.get("COLLECTOSS_ANALYSIS_DIRECTORY"))
        if env_path is None:
            # Joining onto None would raise TypeError; report "not configured"
            # so the caller falls back to the OS default.
            return None
        return env_path.joinpath(*parts)

    @staticmethod
    def get_facade_directory(create: bool = True) -> Path:
        """Get the facade directory. Requires database for historical compatibility.

        Precedence: ``COLLECTOSS_FACADE_REPO_DIRECTORY``, then the database
        setting ``Facade.repo_directory``, then ``collectoss_facade`` inside
        the OS default downloads directory.

        Raises:
            ValueError: If the resulting path is not a valid directory.
        """
        env_path = _path_from_env(SystemEnv.get("COLLECTOSS_FACADE_REPO_DIRECTORY"))
        database_path = SystemPaths._database_value("Facade", "repo_directory")

        return _verify_path(
            # "or None" keeps an empty database value from being treated as a path.
            _build_path(
                env_path or database_path or None,
                SystemPaths.os_defaults(create).user_downloads_path / "collectoss_facade",
            ),
            create=create,
        )

    @staticmethod
    def facade_repo_path(repo) -> Path:
        """Get the path to a specific facade repository.

        Args:
            repo: An object providing ``repo_id``, ``repo_path`` and ``repo_name``.
        """
        return SystemPaths.get_facade_directory() / f"{repo.repo_id}-{repo.repo_path}/{repo.repo_name}"

    @staticmethod
    def get_config_directory(create: bool = True) -> Path:
        """Get the config directory.

        Configured by ``COLLECTOSS_CONFIG_DIRECTORY`` or, failing that,
        ``CONFIG_DATADIR``; otherwise the OS default user config directory.

        Raises:
            ValueError: If the resulting path is not a valid directory.
        """
        env_path = _path_from_env(SystemEnv.get("COLLECTOSS_CONFIG_DIRECTORY") or SystemEnv.get("CONFIG_DATADIR"))

        return _verify_path(
            _build_path(env_path, SystemPaths.os_defaults(create).user_config_path),
            create=create,
        )

    @staticmethod
    def get_logs_directory(create: bool = True) -> Path:
        """Get the logs directory. Requires database for historical compatibility.

        Precedence: ``COLLECTOSS_LOGS_DIRECTORY``, then the database setting
        ``Logging.logs_directory``, then the OS default user log directory.

        Raises:
            ValueError: If the resulting path is not a valid directory.
        """
        env_path = _path_from_env(SystemEnv.get("COLLECTOSS_LOGS_DIRECTORY"))
        database_path = SystemPaths._database_value("Logging", "logs_directory")

        return _verify_path(
            # "or None" keeps an empty database value from being treated as a path.
            _build_path(
                env_path or database_path or None,
                SystemPaths.os_defaults(create).user_log_path,
            ),
            create=create,
        )

    @staticmethod
    def get_cache_directory(create: bool = True) -> Path:
        """Get the cache directory.

        Configured by ``COLLECTOSS_CACHE_DIRECTORY`` or, failing that,
        ``CACHE_DATADIR``; otherwise the OS default user cache directory.

        Raises:
            ValueError: If the resulting path is not a valid directory.
        """
        env_path = _path_from_env(SystemEnv.get("COLLECTOSS_CACHE_DIRECTORY") or SystemEnv.get("CACHE_DATADIR"))

        return _verify_path(
            _build_path(env_path, SystemPaths.os_defaults(create).user_cache_path),
            create=create,
        )

    @staticmethod
    def get_models_directory(create: bool = True) -> Path:
        """Get the models directory. Requires database for historical compatibility.

        The directory name comes from the database setting
        ``Message_Insights.models_dir`` (``message_models`` when that is
        empty) and is placed under the OS default user data directory.

        Raises:
            ValueError: If the resulting path is not a valid directory.
        """
        database_dirname = SystemPaths._database_value("Message_Insights", 'models_dir') or "message_models"

        return _verify_path(
            SystemPaths.os_defaults(create).user_data_path / "tasks" / "data_analysis" / "message_insights" / database_dirname,
            create=create,
        )

    @staticmethod
    def get_model_training_data_directory(create: bool = True) -> Path:
        """Get the model training data directory.

        Uses ``message_insights/train_data`` under
        ``COLLECTOSS_ANALYSIS_DIRECTORY`` when that variable is set, otherwise
        the equivalent location under the OS default user data directory.

        Raises:
            ValueError: If the resulting path is not a valid directory.
        """
        return _verify_path(
            _build_path(
                SystemPaths._analysis_env_path("message_insights", "train_data"),
                SystemPaths.os_defaults(create).user_data_path / "tasks" / "data_analysis" / "message_insights" / "train_data",
            ),
            create=create,
        )

    @staticmethod
    def get_discourse_analysis_directory(create: bool = True) -> Path:
        """Get the discourse analysis directory.

        Uses ``discourse_analysis`` under ``COLLECTOSS_ANALYSIS_DIRECTORY``
        when that variable is set, otherwise the equivalent location under the
        OS default user data directory.

        Raises:
            ValueError: If the resulting path is not a valid directory.
        """
        return _verify_path(
            _build_path(
                SystemPaths._analysis_env_path("discourse_analysis"),
                SystemPaths.os_defaults(create).user_data_path / "tasks" / "data_analysis" / "discourse_analysis",
            ),
            create=create,
        )

    @staticmethod
    def get_install_path() -> Path:
        """Get the path that CollectOSS is currently installed to. This should be treated as read-only."""
        # This paths file is only one level below the root of the module.
        # accessing above that is not possible as the module could be installed separately
        return _verify_path(Path(__file__).parent, create=False)

    @staticmethod
    def print_all_paths(logger):
        """Log every path managed by this class at INFO level, without creating any directory.

        Args:
            logger: The logger to write to (shadows the module-level logger on purpose).
        """
        logger.info(f"Install path: {SystemPaths.get_install_path()}")
        logger.info(f"Facade directory: {SystemPaths.get_facade_directory(create = False)}")
        logger.info(f"Config directory: {SystemPaths.get_config_directory(create = False)}")
        logger.info(f"Logs directory: {SystemPaths.get_logs_directory(create = False)}")
        logger.info(f"Cache directory: {SystemPaths.get_cache_directory(create = False)}")
        logger.info(f"Models directory: {SystemPaths.get_models_directory(create = False)}")
        logger.info(f"Model training data directory: {SystemPaths.get_model_training_data_directory(create = False)}")
        logger.info(f"Discourse analysis directory: {SystemPaths.get_discourse_analysis_directory(create = False)}")
4 changes: 2 additions & 2 deletions collectoss/tasks/data_analysis/discourse_analysis/tasks.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import logging
from collectoss.application.paths import SystemPaths
import sqlalchemy as s
import pandas as pd
import pickle
Expand Down Expand Up @@ -29,8 +30,7 @@
# from os import path

stemmer = nltk.stem.snowball.SnowballStemmer("english")
ROOT_PROJECT_REPO_DIRECTORY = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))))
DISCOURSE_ANALYSIS_DIR = f"{ROOT_PROJECT_REPO_DIRECTORY}/tasks/data_analysis/discourse_analysis/"
DISCOURSE_ANALYSIS_DIR = SystemPaths.get_discourse_analysis_directory()

@celery.task(base=MLRepoCollectionTask, bind=True)
def discourse_analysis_task(self, repo_git):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import os
from datetime import datetime, timedelta

from collectoss.application.paths import SystemPaths
import numpy as np
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
Expand All @@ -16,10 +17,7 @@
from collectoss.tasks.data_analysis.message_insights.preprocess_text import \
normalize_corpus as normalize_corpus

ROOT_PROJECT_REPO_DIRECTORY = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))))


train_path = os.path.join(ROOT_PROJECT_REPO_DIRECTORY, "tasks", "data_analysis", "message_insights", "train_data")
train_path = SystemPaths.get_model_training_data_directory()

# ''' Doc2Vec model training

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import warnings
from statistics import mean

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[pylint] reported by reviewdog 🐶
W0611: Unused mean imported from statistics (unused-import)


from collectoss.application.paths import SystemPaths
import emoji
import joblib
import nltk
Expand All @@ -30,11 +31,9 @@

warnings.filterwarnings('ignore')

ROOT_PROJECT_REPO_DIRECTORY = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))))

CONTRACTION_MAP = contraction_map

train_path = os.path.join(ROOT_PROJECT_REPO_DIRECTORY, "tasks", "data_analysis", "message_insights", "train_data")
train_path = SystemPaths.get_model_training_data_directory()

def replace_all(text, dic):
if(sys.version_info[0] < 3):
Expand Down
5 changes: 2 additions & 3 deletions collectoss/tasks/data_analysis/message_insights/tasks.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import datetime
import logging
import os
from collectoss.application.paths import SystemPaths
import numpy as np
import pandas as pd
import requests
Expand All @@ -18,8 +19,6 @@

#SPDX-License-Identifier: MIT

ROOT_PROJECT_REPO_DIRECTORY = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))))

@celery.task(base=MLRepoCollectionTask, bind=True)
def message_insight_task(self, repo_git):

Expand All @@ -45,7 +44,7 @@ def message_insight_model(repo_git: str,logger,engine) -> None:
repo = get_repo_by_repo_git(repo_git)
repo_id = repo.repo_id

models_dir = os.path.join(ROOT_PROJECT_REPO_DIRECTORY, "tasks", "data_analysis", "message_insights", get_value("Message_Insights", 'models_dir'))
models_dir = SystemPaths.get_models_directory()
insight_days = get_value("Message_Insights", 'insight_days')

# Any initial database instructions, like finding the last tuple inserted or generate the next ID value
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import logging
import os
import datetime

from collectoss.application.paths import SystemPaths
import joblib
import pandas as pd
import sqlalchemy as s
Expand All @@ -18,8 +18,6 @@
# from sklearn.preprocessing import LabelEncoder, MinMaxScaler
# from xgboost import XGBClassifier

ROOT_PROJECT_REPO_DIRECTORY = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))))

@celery.task(base=MLRepoCollectionTask, bind=True)
def pull_request_analysis_task(self, repo_git):

Expand All @@ -40,7 +38,7 @@ def pull_request_analysis_model(repo_git: str,logger,engine) -> None:

repo_id = get_repo_by_repo_git(repo_git).repo_id

senti_models_dir = os.path.join(ROOT_PROJECT_REPO_DIRECTORY, "tasks", "data_analysis", "message_insights", get_value("Message_Insights", 'models_dir'))
senti_models_dir = SystemPaths.get_models_directory()

logger.info(f'Sentiment model dir located - {senti_models_dir}')

Expand Down
Loading
Loading