Skip to content
This repository was archived by the owner on Nov 5, 2025. It is now read-only.

Commit c2e664f

Browse files
committed
Simplify scraper task
1 parent 4c93f8e commit c2e664f

File tree

4 files changed

+36
-8
lines changed

4 files changed

+36
-8
lines changed

contratospr/contracts/management/commands/scrape_contracts.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@ class Command(BaseCommand):
77
help = "Scrape search results"
88

99
def add_arguments(self, parser):
10+
parser.add_argument("--max-items", nargs="?", type=int, default=None)
1011
parser.add_argument("--limit", nargs="?", type=int, default=None)
1112
parser.add_argument("--date-of-grant-start", nargs="?", type=str, default=None)
1213
parser.add_argument("--date-of-grant-end", nargs="?", type=str, default=None)
@@ -22,4 +23,5 @@ def add_arguments(self, parser):
2223

2324
def handle(self, *args, **options):
2425
limit = options.pop("limit", None)
25-
scrape_contracts.delay(limit=limit, **options)
26+
max_items = options.pop("max_items", None)
27+
scrape_contracts.delay(limit=limit, max_items=max_items, **options)

contratospr/contracts/scraper.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -135,7 +135,7 @@ def get_contracts(offset, limit, **kwargs):
135135
"search": {"value": "", "regex": False},
136136
},
137137
],
138-
"order": [{"column": 3, "dir": "desc"}, {"column": 6, "dir": "desc"}],
138+
"order": [{"column": 1, "dir": "desc"}],
139139
"start": offset,
140140
"length": limit,
141141
"EntityId": kwargs.get("entity_id"),

contratospr/contracts/tasks.py

Lines changed: 23 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22
import re
33

44
import pytz
5+
from structlog import get_logger
56

67
from ..tasks import app
78
from .models import Contract, Contractor, Document, Entity, Service, ServiceGroup
@@ -14,6 +15,8 @@
1415
)
1516
from .search import index_contract
1617

18+
logger = get_logger(__name__)
19+
1720

1821
def parse_date(value):
1922
if not value:
@@ -29,6 +32,8 @@ def strip_whitespace(value):
2932

3033
@app.task
3134
def expand_contract(contract):
35+
logger.info("Expanding contract", contract=contract["ContractNumber"])
36+
3237
result = {
3338
"entity_id": contract["EntityId"],
3439
"entity_name": strip_whitespace(contract["EntityName"]),
@@ -118,6 +123,10 @@ def request_contract_document(contract_id):
118123

119124
@app.task
120125
def update_contract(result, parent_id=None):
126+
logger.info(
127+
"Updating contract", contract=result["contract_number"], parent_id=parent_id
128+
)
129+
121130
entity, _ = Entity.objects.get_or_create(
122131
source_id=result["entity_id"], defaults={"name": result["entity_name"]}
123132
)
@@ -175,20 +184,28 @@ def update_contract(result, parent_id=None):
175184

176185

177186
@app.task
178-
def scrape_contracts(limit=None, **kwargs):
187+
def scrape_contracts(limit=None, max_items=None, **kwargs):
179188
offset = 0
180189
total_records = 0
181-
default_limit = 1000
190+
default_limit = 10
191+
real_limit = limit or default_limit
182192

183193
while offset <= total_records:
184-
real_limit = limit or default_limit
194+
logger.info(
195+
"Scraping contracts",
196+
limit=limit,
197+
real_limit=real_limit,
198+
offset=offset,
199+
total_records=total_records,
200+
)
201+
185202
contracts = get_contracts(offset, real_limit, **kwargs)
186203

187204
if not total_records:
188-
total_records = limit if limit else contracts["recordsFiltered"]
205+
total_records = max_items if max_items else contracts["recordsFiltered"]
189206

190207
for contract in contracts["data"]:
191-
chain = expand_contract.s(contract) | update_contract.s()
192-
chain()
208+
expanded = expand_contract(contract)
209+
update_contract(expanded)
193210

194211
offset += real_limit

contratospr/tasks.py

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,9 @@
11
import os
22

33
import configurations
4+
import structlog
45
from celery import Celery
6+
from celery.signals import task_prerun
57

68
configuration = os.getenv("ENVIRONMENT", "development").title()
79
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "contratospr.settings")
@@ -11,6 +13,13 @@
1113

1214
from django.conf import settings # noqa isort:skip
1315

16+
17+
@task_prerun.connect
18+
def configure_structlog(sender, body=None, **kwargs):
19+
logger = structlog.get_logger("contratospr.tasks")
20+
logger.new(task_id=kwargs["task_id"], task_name=sender.__name__)
21+
22+
1423
app = Celery("contratospr")
1524
app.config_from_object(settings, namespace="CELERY")
1625
app.autodiscover_tasks(lambda: settings.INSTALLED_APPS)

0 commit comments

Comments
 (0)