Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
### 🔧 Internal changes

- Refactored the `Courses` table to `Course` with a database migration
- Refactored the `parseArtSci` function in `app/WebParsing/ArtSciParser.hs` by introducing `parseDepartmentList`

## [0.8.0] - 2026-06-09

Expand Down
22 changes: 19 additions & 3 deletions app/WebParsing/ArtSciParser.hs
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
module WebParsing.ArtSciParser
(parseCalendar, getDeptList) where
(parseCalendar, getDeptList, parseDepartmentList) where

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

While parseDepartmentList is good to be exported for testing purposes, getDeptList isn't used anywhere else in the codebase so we can remove it from this export list.


import Config (fasCalendarUrl, programsUrl, runDb)
import Control.Monad.IO.Class (liftIO)
import Data.List (findIndex, nubBy)
import Data.Maybe (fromMaybe, mapMaybe)
import qualified Data.Text as T
import qualified Data.Bifunctor as BF
import Data.Text.Lazy (toStrict)
import Data.Text.Lazy.Encoding (decodeUtf8)
import Database.Persist (insertUnique)
Expand Down Expand Up @@ -34,13 +35,28 @@ parseCalendar = do
-- | Obtain the department list for the page at 'programsUrl', pass the second
-- component of every entry (the department name) to 'insertDepts' inside
-- 'runDb', and run 'parseDepartment' on each entry, keeping only the first
-- entry for any repeated first component (the department page).
--
-- NOTE(review): this text was captured from a diff view whose +/- markers were
-- stripped. The 'httpBodyTags' / 'getDeptList' lines appear to be the
-- pre-change lines that the 'parseDepartmentList' call replaces -- confirm
-- against the real file before relying on this listing.
parseArtSci :: IO ()
parseArtSci = do
programs <- programsUrl
bodyTags <- httpBodyTags programs
let deptInfo = getDeptList bodyTags
deptInfo <- parseDepartmentList programs
runDb $ do
liftIO $ putStrLn "Inserting departments"
insertDepts $ map snd deptInfo
mapM_ parseDepartment (nubBy (\(x, _) (y, _) -> x == y) deptInfo)

-- | Parse the list of all departments, given the URL of the program/subject areas page.
-- Exclude departments with no courses, duplicate courses, and program areas belonging to a college.
--
-- The result is a list of (department page, department name) pairs taken from
-- 'getDeptList'. Every "\160" (non-breaking space) in a name is replaced by a
-- regular space. An entry is kept only when its page starts with "/", its name
-- is not one of the ignored departments listed below, and its name does not
-- end with " College)".
parseDepartmentList :: String -> IO [(T.Text, T.Text)]
parseDepartmentList url = do
let ignoredDepts = ["ASIP (Arts & Science Internship Program)",
"Biology",
"Combined Degree Programs",
"Data Science",
"Faculty of Arts and Science Programs (299/398/399)",
"Laboratory Medicine and Pathobiology)", -- Displayed as "Pathobiology (see Laboratory Medicine and Pathobiology)" on program areas page
"Research Opportunity/Research Excursions (299/398/399)"]
bodyTags <- httpBodyTags url
let deptList = getDeptList bodyTags
let cleaned = map (BF.second (T.replace "\160" " ")) deptList

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

First, move this into getDeptList (i.e., we don't need to separate out the "cleaning" step). That function currently only calls T.strip, so it can be simplified by using getDeptList anyways.

Second, rather than do this replacing, please use the existing cleanText. That said, replacing "\160" with " " is better than replacing it with "", so you can also go ahead and modify cleanText itself. (See here for a reference for the '\160' char.)

return $ filter (\(deptPage, deptName) -> "/" `T.isPrefixOf` deptPage && deptName `notElem` ignoredDepts && not (" College)" `T.isSuffixOf` deptName)) cleaned

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As was mentioned at our meeting yesterday, this filtering is a great candidate for a local helper function. The idiomatic way to define this is to use a where block in this function:

    return filterDepartments deptList
    where
        filterDepartments :: [(T.Text, T.Text)] -> [(T.Text, T.Text)]
        ...


-- | Converts the processed main page and extracts a list of department html pages
-- and department names
getDeptList :: [Tag T.Text] -> [(T.Text, T.Text)]
Expand Down
128 changes: 128 additions & 0 deletions backend-test/WebParsing/ArtSciParserTests.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
{-|
Description: ArtSciParser module tests.

Module that contains the tests for the functions in the ArtSciParser module.

-}

module WebParsing.ArtSciParserTests
( test_artSciParser
) where

import qualified Data.Text as T
import WebParsing.ArtSciParser (parseDepartmentList)
import Test.Tasty (TestTree, testGroup)
import Test.Tasty.HUnit (assertEqual, testCase)

-- | Expected output of 'parseDepartmentList' for the program/subject areas
-- page, as (department page, department name) pairs in page order.
parsedDepts :: [(T.Text, T.Text)]
parsedDepts =
    [ ("/section/Academic-Bridging-Program", "Academic Bridging Program")
    , ("/section/Actuarial-Science", "Actuarial Science")
    , ("/section/African-Studies", "African Studies Centre")
    , ("/section/American-Studies", "American Studies")
    , ("/section/Anatomy", "Anatomy")
    , ("/section/Anthropology", "Anthropology")
    , ("/section/Archaeology", "Archaeology")
    , ("/section/Architecture-and-Visual-Studies", "Architecture and Visual Studies")
    , ("/section/Art-History", "Art History")
    , ("/section/Astronomy-and-Astrophysics", "Astronomy and Astrophysics")
    , ("/section/Biochemistry", "Biochemistry")
    , ("/section/Business-Fundamentals", "Business Fundamentals")
    , ("/section/Centre-for-Caribbean-Studies", "Caribbean Studies, Centre for")
    , ("/section/Cell-and-Systems-Biology", "Cell and Systems Biology")
    , ("/section/Chemistry", "Chemistry")
    , ("/section/Cinema-Studies-Institute", "Cinema Studies (Cinema Studies Institute)")
    , ("/section/Classics", "Classics")
    , ("/section/Computer-Science", "Computer Science")
    , ("/section/Contemporary-Asian-Studies", "Contemporary Asian Studies, Dr. David Chu Program in")
    , ("/section/Criminology-and-Sociolegal-Studies", "Criminology and Sociolegal Studies, Centre for")
    , ("/section/Diaspora-and-Transnational-Studies", "Diaspora and Transnational Studies")
    , ("/section/Drama,-Theatre-and-Performance-Studies", "Drama, Theatre and Performance Studies, Centre for")
    , ("/section/Earth-Sciences", "Earth Sciences")
    , ("/section/East-Asian-Studies", "East Asian Studies")
    , ("/section/Ecology-and-Evolutionary-Biology", "Ecology and Evolutionary Biology")
    , ("/section/Economics", "Economics")
    , ("/section/English", "English")
    , ("/section/Centre-for-Entrepreneurship", "Entrepreneurship, Centre for")
    , ("/section/School-of-the-Environment", "Environment (School of the Environment)")
    , ("/section/Slavic-and-East-European-Languages-and-Cultures", "Estonian")
    , ("/section/Centre-for-Ethics", "Ethics, Centre for")
    , ("/section/European-Affairs", "European Affairs")
    , ("/section/Slavic-and-East-European-Languages-and-Cultures", "Finnish")
    , ("/section/First-Year-Foundations", "First-Year Foundations")
    , ("/section/Forest-Conservation-and-Forest-Biomaterials-Science", "Forest Conservation and Forest Biomaterials Science")
    , ("/section/French", "French")
    , ("/section/Geography-and-Planning", "Geography and Planning")
    , ("/section/German", "German")
    , ("/section/History", "History")
    , ("/section/History-and-Philosophy-of-Science-and-Technology", "History and Philosophy of Science and Technology")
    , ("/section/Human-Biology", "Human Biology")
    , ("/section/Hungarian", "Hungarian")
    , ("/section/Immunology", "Immunology")
    , ("/section/Indigenous-Studies", "Indigenous Studies")
    , ("/section/Industrial-Relations-and-Human-Resources", "Industrial Relations and Human Resources, Centre for")
    , ("/section/Innis-College", "Innis College")
    , ("/section/Italian", "Italian")
    , ("/section/Centre-for-Jewish-Studies", "Jewish Studies, Centre for")
    , ("/section/Laboratory-Medicine-and-Pathobiology", "Laboratory Medicine and Pathobiology")
    , ("/section/Latin-American-Studies", "Latin American Studies")
    , ("/section/Linguistics", "Linguistics")
    , ("/section/Materials-Science", "Materials Science")
    , ("/section/Mathematics", "Mathematics")
    , ("/section/Centre-for-Medieval-Studies", "Medieval Studies, Centre for")
    , ("/section/Molecular-Genetics-and-Microbiology", "Molecular Genetics and Microbiology")
    , ("/section/Munk-School-of-Global-Affairs-and-Public-Policy", "Munk School of Global Affairs and Public Policy")
    , ("/section/Music", "Music")
    , ("/section/Near-and-Middle-Eastern-Civilizations", "Near and Middle Eastern Civilizations")
    , ("/section/New-College", "New College")
    , ("/section/Nutritional-Sciences", "Nutritional Sciences")
    , ("/section/Munk-School-of-Global-Affairs-and-Public-Policy", "Peace, Conflict and Justice")
    , ("/section/Pharmacology-and-Toxicology", "Pharmacology and Toxicology")
    , ("/section/Philosophy", "Philosophy")
    , ("/section/Physics", "Physics")
    , ("/section/Physiology", "Physiology")
    , ("/section/Planetary-Science", "Planetary Science")
    , ("/section/Political-Science", "Political Science")
    , ("/section/Portuguese", "Portuguese")
    , ("/section/Psychology", "Psychology")
    , ("/section/Munk-School-of-Global-Affairs-and-Public-Policy", "Public Policy")
    , ("/section/Religion", "Religion")
    , ("/section/Rotman-Commerce", "Rotman Commerce")
    , ("/section/St.-Michael's-College", "St. Michael's College")
    , ("/section/Sexual-Diversity-Studies", "Sexual Diversity Studies, Mark S. Bonham Centre for")
    , ("/section/Slavic-and-East-European-Languages-and-Cultures", "Slavic and East European Languages and Cultures")
    , ("/section/Sociology", "Sociology")
    , ("/section/South-Asian-Studies", "South Asian Studies")
    , ("/section/Spanish", "Spanish")
    , ("/section/Statistical-Sciences", "Statistical Sciences")
    , ("/section/Canadian-Institute-for-Theoretical-Astrophysics", "Theoretical Astrophysics (Canadian Institute for Theoretical Astrophysics)")
    , ("/section/Trinity-College", "Trinity College")
    , ("/section/University-College", "University College")
    , ("/section/Geography-and-Planning", "Urban Studies")
    , ("/section/Victoria-College", "Victoria College")
    , ("/section/Women-and-Gender-Studies", "Women and Gender Studies")
    , ("/section/Woodsworth-College", "Woodsworth College")
    , ("/writing-faculty-arts-science", "Writing in the Faculty of Arts & Science")
    , ("/section/Yiddish-Studies", "Yiddish Studies")
    ]

-- | Scenarios for 'parseDepartmentList'. Each entry is
-- (label, URL of the page to parse, departments expected back).
parseDeptListTestCases :: [(String, String, [(T.Text, T.Text)])]
parseDeptListTestCases =
    [ ( "Program/subject areas page"
      , "https://artsci.calendar.utoronto.ca/listing-program-subject-areas"
      , parsedDepts
      )
    ]

-- | Build a test from one (label, input URL, expected output) scenario: call
-- 'parseDepartmentList' on the URL and compare its result with the expectation.
runParseDeptListTest :: (String, String, [(T.Text, T.Text)]) -> TestTree
runParseDeptListTest (caseName, pageUrl, wanted) =
    testCase caseName (parseDepartmentList pageUrl >>= assertEqual failureMsg wanted)
  where
    -- Message reported when the parsed list differs from the expectation.
    failureMsg = "Unexpected parsing result for " ++ caseName

-- | One test per entry of 'parseDeptListTestCases', in the same order.
runParseDeptListTests :: [TestTree]
runParseDeptListTests =
    [ runParseDeptListTest scenario | scenario <- parseDeptListTestCases ]

-- | All ArtSciParser module tests, grouped under a single label.
test_artSciParser :: TestTree
test_artSciParser = testGroup groupLabel runParseDeptListTests
  where
    groupLabel = "ArtSciParser tests"