Skip to content
This repository was archived by the owner on Nov 5, 2025. It is now read-only.

Commit c47bbed

Browse files
committed
Add utils and docs for downloading all contracts
1 parent 6db17c3 commit c47bbed

File tree

14 files changed

+327
-13
lines changed

14 files changed

+327
-13
lines changed

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,5 +94,7 @@ media
9494
node_modules
9595

9696
.vscode/
97-
*.dump
9897
.idea/
98+
99+
docker/postgres/backup.dump
100+
data/contracts.jsonl.gz

CONTRIBUTING.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,9 +87,9 @@ If you want to go the usual route and run the project locally, though:
8787
- Install pre-commit hooks: `pipenv run pre-commit install`
8888
- Set up environment file: `cp example.env .env`
8989
- Install docker-compose: https://docs.docker.com/compose/install/
90+
- [Download latest database dump](docs/database-dump.md)
9091
- Build and start containers, including test container: `docker-compose up --build -d`
9192
- Create super user: `docker-compose exec web python manage.py createsuperuser`
92-
- Scrape some contracts: `docker-compose exec web python manage.py scrape_contracts --limit 100`
9393
- Open http://localhost:8000
9494

9595
And you should be ready to go!

bin/docker-entrypoint

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,16 @@ postgres_ready(){
66
python manage.py shell << END
77
import sys
88
import psycopg2
9-
from django.db import connections
9+
import dj_database_url
1010
try:
11-
connections['default'].cursor()
11+
parsed = dj_database_url.parse("${DATABASE_URL}")
12+
psycopg2.connect(
13+
dbname=parsed["NAME"],
14+
user=parsed["USER"],
15+
password=parsed["PASSWORD"],
16+
host=parsed["HOST"],
17+
port=parsed["PORT"],
18+
)
1219
except psycopg2.OperationalError:
1320
sys.exit(-1)
1421
sys.exit(0)
@@ -35,6 +42,11 @@ case "$1" in
3542
;;
3643

3744
"run-tests")
45+
until postgres_ready; do
46+
>&2 echo "==> Waiting for Postgres..."
47+
sleep 1
48+
done
49+
3850
python runtests.py
3951
;;
4052

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
import json
2+
3+
from django.core.management.base import BaseCommand
4+
from structlog import get_logger
5+
6+
from ...scraper import get_amendments, get_contractors, get_contracts, get_entities
7+
8+
logger = get_logger("contratospr.commands.download_contracts")
9+
10+
11+
def get_contracts_by_entity(entity):
    """Download every page of contracts for one entity into ``data/``.

    Each page is requested through ``get_contracts`` and every contract in
    it is extended with a ``_Contractors`` entry (from ``get_contractors``)
    and an ``_Amendments`` entry (from ``get_amendments`` when the contract
    reports ``HasAmendments``, otherwise ``None``). The page is written to
    ``data/contracts-<entity_id>-<offset>.json``.

    Args:
        entity: Mapping with at least the ``Code`` and ``Name`` keys.

    Raises:
        KeyError: If a search response has no ``recordsFiltered`` key.
    """
    limit = 1000
    offset = 0
    total_records = 0

    entity_id = entity["Code"]
    entity_name = entity["Name"].strip()

    while True:
        logger.info(
            "Scraping contracts",
            limit=limit,
            entity_id=entity_id,
            entity_name=entity_name,
            offset=offset,
            total_records=total_records,
        )

        contracts_json = get_contracts(offset, limit, entity_id=entity_id)
        page_path = f"data/contracts-{entity_id}-{offset}.json"

        # Checkpoint the raw page first so it survives a crash while the
        # per-contract requests below are still running.
        with open(page_path, "w") as f:
            json.dump(contracts_json, f)

        expanded_contracts = []

        for contract_data in contracts_json.get("data", []):
            try:
                logger.info(
                    "Getting contractors", contract_id=contract_data["ContractId"]
                )
                contract_data["_Contractors"] = get_contractors(
                    contract_data["ContractId"]
                )
                contract_data["_Amendments"] = None

                if contract_data["HasAmendments"]:
                    logger.info(
                        "Getting amendments",
                        contract_number=contract_data["ContractNumber"],
                        entity_id=contract_data["EntityId"],
                    )
                    contract_data["_Amendments"] = get_amendments(
                        contract_data["ContractNumber"], contract_data["EntityId"]
                    )
            except Exception as exc:
                # Best effort: keep the contract even when it could not be
                # extended, but record the failure with its traceback.
                # ``.get`` is used because a missing ``ContractId`` may be
                # the very error being handled.
                logger.exception(
                    "Error extending contract",
                    contract_id=contract_data.get("ContractId"),
                    exception=exc,
                )

            expanded_contracts.append(contract_data)

        contracts_json["data"] = expanded_contracts

        # Overwrite the checkpoint with the extended page.
        with open(page_path, "w") as f:
            json.dump(contracts_json, f)

        if not total_records:
            total_records = contracts_json["recordsFiltered"]

        offset += limit

        # Stop once the next offset is at or past the reported total. The
        # previous ``offset <= total_records`` test requested one extra,
        # empty page whenever the total was an exact multiple of ``limit``.
        if offset >= total_records:
            break
73+
74+
75+
def get_contracts_by_entities(entities):
    """Run ``get_contracts_by_entity`` for each entity, in iteration order."""
    for current_entity in entities:
        get_contracts_by_entity(current_entity)
78+
79+
80+
class Command(BaseCommand):
    help = "Download contracts for entities from consultacontratos.ocpr.gov.pr"

    def add_arguments(self, parser):
        """Register the optional ``--file`` argument (path to an entities JSON file)."""
        parser.add_argument("--file", nargs="?", type=str, default=None)

    def handle(self, *args, **options):
        """Download contracts for every entity.

        Entities come from the ``Results`` array of the JSON file given via
        ``--file`` when one is provided, otherwise from ``get_entities()``.
        """
        entities_file_path = options.get("file")

        if entities_file_path:
            # Use a context manager so the file handle is closed promptly
            # instead of being left to the garbage collector.
            with open(entities_file_path) as entities_file:
                entities = json.load(entities_file).get("Results", [])
        else:
            entities = get_entities()

        get_contracts_by_entities(entities)
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
import gzip
2+
import json
3+
import os
4+
5+
from django.core.management.base import BaseCommand
6+
from structlog import get_logger
7+
8+
from ...tasks import normalize_contract, normalize_contractors, update_contract
9+
10+
logger = get_logger("contratospr.commands.import_contracts")
11+
12+
13+
def _normalize_contract(contract):
    """Normalize a downloaded contract together with its contractors and amendments.

    The contract goes through ``normalize_contract``; its ``_Contractors``
    value (default: empty list) goes through ``normalize_contractors``; each
    entry of ``_Amendments`` is normalized recursively and appended to the
    result's ``amendments`` list.
    """
    result = normalize_contract(contract)
    result["contractors"] = normalize_contractors(contract.get("_Contractors", []))

    # A missing, ``None`` or empty ``_Amendments`` value means nothing to attach.
    for amendment in contract.get("_Amendments") or []:
        result["amendments"].append(_normalize_contract(amendment))

    return result
24+
25+
26+
def import_contracts(contracts):
    """Normalize each downloaded contract and pass it to ``update_contract``."""
    for raw_contract in contracts:
        record = _normalize_contract(raw_contract)

        logger.info(
            "Importing contract",
            contract_id=record["contract_id"],
            entity_id=record["entity_id"],
        )

        update_contract(record)
37+
38+
39+
class Command(BaseCommand):
    help = "Import contracts"

    def handle(self, *args, **options):
        """Import every batch of contracts from the gzipped JSON-lines dump.

        Each line of ``data/contracts.jsonl.gz`` is parsed as one JSON
        document and handed to ``import_contracts``.
        """
        dump_path = os.path.join("data", "contracts.jsonl.gz")

        with gzip.open(dump_path, "r") as dump:
            for raw_line in dump:
                import_contracts(json.loads(raw_line))
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
import gzip
2+
import json
3+
import os
4+
import shutil
5+
6+
from django.core.management.base import BaseCommand
7+
from structlog import get_logger
8+
9+
logger = get_logger("contratospr.commands.merge_contracts")
10+
11+
12+
class Command(BaseCommand):
    help = "Merge contracts"

    def handle(self, *args, **options):
        """Merge the per-page JSON files in ``data/`` into one gzipped JSON-lines file.

        The ``data`` array of every ``*.json`` file in ``data/`` is written
        as one line of ``data/contracts.json``, which is then compressed to
        ``data/contracts.jsonl.gz``.
        """
        merged_file_name = "contracts.json"
        merged_file_path = os.path.join("data", merged_file_name)
        gzipped_merged_file_name = "contracts.jsonl.gz"
        gzipped_merged_file_path = os.path.join("data", gzipped_merged_file_name)

        # The merged file is derived output, so it is rebuilt from scratch.
        # Appending would duplicate every page each time the command runs.
        with open(merged_file_path, "w") as merged_file:
            # Sorted so the merged output is reproducible across runs.
            for filename in sorted(os.listdir("data")):
                # Compare the bare file name. The previous check compared
                # the joined path against the bare name, so the merged
                # output was never excluded and was read back as an input.
                if filename in (merged_file_name, gzipped_merged_file_name):
                    continue

                file_path = os.path.join("data", filename)

                if not (filename.endswith(".json") and os.path.isfile(file_path)):
                    continue

                logger.info("Merging contracts", filename=filename)

                with open(file_path) as f:
                    contracts_json = json.load(f)

                contracts_data = contracts_json.get("data", [])
                merged_file.write(f"{json.dumps(contracts_data)}\n")

        with open(merged_file_path, "rb") as f_in:
            with gzip.open(gzipped_merged_file_path, "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)

contratospr/contracts/scraper.py

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
import requests
44

5+
from ..utils.requests_retry import requests_retry_session
6+
57
BASE_URL = "https://consultacontratos.ocpr.gov.pr"
68
BASE_CONTRACT_URL = f"{BASE_URL}/contract"
79
BASE_CONTRACTOR_URL = f"{BASE_URL}/contractor"
@@ -21,6 +23,9 @@
2123
]
2224

2325

26+
session = requests_retry_session(retries=3, backoff_factor=0.3)
27+
28+
2429
def send_document_request(contract_id):
2530
response = requests.post(
2631
f"{BASE_CONTRACT_URL}/senddocumentrequest",
@@ -32,7 +37,7 @@ def send_document_request(contract_id):
3237

3338

3439
def get_contractors(contract_id):
35-
response = requests.post(
40+
response = session.post(
3641
f"{BASE_CONTRACTOR_URL}/findbycontractid",
3742
json={"contractId": contract_id},
3843
headers={"user-agent": random.choice(USER_AGENTS)},
@@ -42,7 +47,7 @@ def get_contractors(contract_id):
4247

4348

4449
def get_amendments(contract_number, entity_id):
45-
response = requests.post(
50+
response = session.post(
4651
f"{BASE_CONTRACT_URL}/getamendments",
4752
json={"contractNumber": contract_number, "entityId": entity_id},
4853
headers={"user-agent": random.choice(USER_AGENTS)},
@@ -52,7 +57,7 @@ def get_amendments(contract_number, entity_id):
5257

5358

5459
def get_contracts(offset, limit, **kwargs):
55-
response = requests.post(
60+
response = session.post(
5661
f"{BASE_CONTRACT_URL}/search",
5762
json={
5863
"draw": 1,
@@ -154,3 +159,12 @@ def get_contracts(offset, limit, **kwargs):
154159
)
155160

156161
return response.json()
162+
163+
164+
def get_entities():
    """Request the entity listing and return its ``Results`` array.

    Returns an empty list when the response JSON has no ``Results`` key.
    """
    url = f"{BASE_URL}/entity/findby?name=&pageIndex=1&pageSize=1000"
    request_headers = {"user-agent": random.choice(USER_AGENTS)}

    payload = session.get(url, headers=request_headers).json()

    return payload.get("Results", [])

contratospr/contracts/tasks.py

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,7 @@ def strip_whitespace(value):
3030
return value.strip() if value else None
3131

3232

33-
@app.task
34-
def expand_contract(contract):
35-
logger.info("Expanding contract", contract=contract["ContractNumber"])
36-
33+
def normalize_contract(contract):
3734
result = {
3835
"entity_id": contract["EntityId"],
3936
"entity_name": strip_whitespace(contract["EntityName"]),
@@ -60,17 +57,34 @@ def expand_contract(contract):
6057
"document_url"
6158
] = f"{BASE_CONTRACT_URL}/downloaddocument?documentid={document_id}"
6259

63-
contractors = get_contractors(result["contract_id"])
60+
return result
61+
62+
63+
def normalize_contractors(contractors):
64+
results = []
6465

6566
for contractor in contractors:
66-
result["contractors"].append(
67+
results.append(
6768
{
6869
"contractor_id": contractor["ContractorId"],
6970
"entity_id": contractor["EntityId"],
7071
"name": contractor["Name"],
7172
}
7273
)
7374

75+
return results
76+
77+
78+
@app.task
79+
def expand_contract(contract):
80+
logger.info("Expanding contract", contract=contract["ContractNumber"])
81+
82+
result = normalize_contract(contract)
83+
84+
contractors = get_contractors(result["contract_id"])
85+
86+
result["contractors"] = normalize_contractors(contractors)
87+
7488
if result["has_amendments"]:
7589
amendments = get_amendments(result["contract_number"], result["entity_id"])
7690

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
import requests
2+
from requests.adapters import HTTPAdapter
3+
from requests.packages.urllib3.util.retry import Retry
4+
5+
6+
# From https://www.peterbe.com/plog/best-practice-with-retries-with-requests
7+
def requests_retry_session(
8+
retries=3, backoff_factor=0.3, status_forcelist=(500, 502, 504), session=None
9+
):
10+
session = session or requests.Session()
11+
retry = Retry(
12+
total=retries,
13+
read=retries,
14+
connect=retries,
15+
backoff_factor=backoff_factor,
16+
status_forcelist=status_forcelist,
17+
)
18+
adapter = HTTPAdapter(max_retries=retry)
19+
session.mount("http://", adapter)
20+
session.mount("https://", adapter)
21+
return session

docker-compose.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,12 @@ services:
1111
database:
1212
image: postgres:10.6
1313
environment:
14+
- POSTGRES_USER=postgres
15+
- POSTGRES_DB=postgres
1416
- POSTGRES_PASSWORD=password
1517
volumes:
1618
- database:/var/lib/postgresql/data
19+
- ./docker/postgres:/docker-entrypoint-initdb.d
1720

1821
redis:
1922
image: redis:3.2
@@ -71,6 +74,7 @@ services:
7174
command: run-tests
7275
entrypoint: /app/bin/docker-entrypoint
7376
environment:
77+
- ENVIRONMENT=testing
7478
- DATABASE_URL=postgres://postgres:password@database/postgres
7579
- REDIS_URL=redis://redis:6379
7680
volumes:

0 commit comments

Comments
 (0)