-
Notifications
You must be signed in to change notification settings - Fork 75
Refactor parseArtSci function and add tests for newly introduced parseDepartmentList #1725
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
417223d
622c635
6f84688
d208de1
ec39261
309e313
67684a4
6752a68
caa36b4
12d1582
acc4003
7965eb6
38f79d6
6f6013a
d63629a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,11 +1,12 @@ | ||
| module WebParsing.ArtSciParser | ||
| (parseCalendar, getDeptList) where | ||
| (parseCalendar, getDeptList, parseDepartmentList) where | ||
|
|
||
| import Config (fasCalendarUrl, programsUrl, runDb) | ||
| import Control.Monad.IO.Class (liftIO) | ||
| import Data.List (findIndex, nubBy) | ||
| import Data.Maybe (fromMaybe, mapMaybe) | ||
| import qualified Data.Text as T | ||
| import qualified Data.Bifunctor as BF | ||
| import Data.Text.Lazy (toStrict) | ||
| import Data.Text.Lazy.Encoding (decodeUtf8) | ||
| import Database.Persist (insertUnique) | ||
|
|
@@ -34,13 +35,28 @@ parseCalendar = do | |
| -- | Load the department list for the page at 'programsUrl', pass the second component of every pair (the department names) to 'insertDepts' inside 'runDb', and call 'parseDepartment' on one entry per distinct first component (the department page). | ||
| parseArtSci :: IO () | ||
| parseArtSci = do | ||
| programs <- programsUrl | ||
| -- NOTE(review): deptInfo is bound twice below (via getDeptList and via parseDepartmentList); this looks like the removed and added sides of the diff rendered together -- confirm against the raw patch. | ||
| bodyTags <- httpBodyTags programs | ||
| let deptInfo = getDeptList bodyTags | ||
| deptInfo <- parseDepartmentList programs | ||
| runDb $ do | ||
| liftIO $ putStrLn "Inserting departments" | ||
| insertDepts $ map snd deptInfo | ||
| -- nubBy compares only the first component, so entries that share a page are visited once. | ||
| mapM_ parseDepartment (nubBy (\(x, _) (y, _) -> x == y) deptInfo) | ||
|
|
||
| -- | Parse the list of all departments, given the URL of the program/subject areas page, returning (department page link, department name) pairs with non-breaking spaces (U+00A0) in the names replaced by plain spaces. | ||
| -- Excludes departments with no courses, duplicates, and program areas belonging to a college: an entry is kept only if its link starts with "/", its name is not listed in @ignoredDepts@, and its name does not end in " College)". | ||
| parseDepartmentList :: String -> IO [(T.Text, T.Text)] | ||
| parseDepartmentList url = do | ||
| let ignoredDepts = ["ASIP (Arts & Science Internship Program)", | ||
| "Biology", | ||
| "Combined Degree Programs", | ||
| "Data Science", | ||
| "Faculty of Arts and Science Programs (299/398/399)", | ||
| "Laboratory Medicine and Pathobiology)", -- Displayed as "Pathobiology (see Laboratory Medicine and Pathobiology)" on the program areas page | ||
| "Research Opportunity/Research Excursions (299/398/399)"] | ||
| bodyTags <- httpBodyTags url | ||
| let deptList = getDeptList bodyTags | ||
| let cleaned = map (BF.second (T.replace "\160" " ")) deptList | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. First, move this into Second, rather than do this replacing, please use the existing |
||
| return $ filter (\(deptPage, deptName) -> "/" `T.isPrefixOf` deptPage && deptName `notElem` ignoredDepts && not (" College)" `T.isSuffixOf` deptName)) cleaned | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. As was mentioned at our meeting yesterday, this filtering is a great candidate for a local helper function. The idiomatic way to define this is to use a `where` clause:
return filterDepartments deptList
  where
    filterDepartments :: [(T.Text, T.Text)] -> [(T.Text, T.Text)]
    ... |
||
|
|
||
| -- | Converts the processed main page and extracts a list of department html pages | ||
| -- and department names | ||
| getDeptList :: [Tag T.Text] -> [(T.Text, T.Text)] | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,128 @@ | ||
| {-| | ||
| Description: ArtSciParser module tests. | ||
|
|
||
| Module that contains the tests for the functions in the ArtSciParser module. | ||
|
|
||
| -} | ||
|
|
||
| module WebParsing.ArtSciParserTests | ||
| ( test_artSciParser | ||
| ) where | ||
|
|
||
| import qualified Data.Text as T | ||
| import WebParsing.ArtSciParser (parseDepartmentList) | ||
| import Test.Tasty (TestTree, testGroup) | ||
| import Test.Tasty.HUnit (assertEqual, testCase) | ||
|
|
||
| -- | Expected result of 'parseDepartmentList' for the program/subject areas page, as (department page link, department name) pairs. | ||
| -- NOTE(review): the test obtains the actual value by parsing that page's URL at run time, so this fixture presumably has to be kept in step with the live page -- confirm. | ||
| parsedDepts :: [(T.Text, T.Text)] | ||
| parsedDepts = | ||
| [ ("/section/Academic-Bridging-Program", "Academic Bridging Program"), | ||
| ("/section/Actuarial-Science", "Actuarial Science"), | ||
| ("/section/African-Studies", "African Studies Centre"), | ||
| ("/section/American-Studies", "American Studies"), | ||
| ("/section/Anatomy", "Anatomy"), | ||
| ("/section/Anthropology", "Anthropology"), | ||
| ("/section/Archaeology", "Archaeology"), | ||
| ("/section/Architecture-and-Visual-Studies", "Architecture and Visual Studies"), | ||
| ("/section/Art-History", "Art History"), | ||
| ("/section/Astronomy-and-Astrophysics", "Astronomy and Astrophysics"), | ||
| ("/section/Biochemistry", "Biochemistry"), | ||
| ("/section/Business-Fundamentals", "Business Fundamentals"), | ||
| ("/section/Centre-for-Caribbean-Studies", "Caribbean Studies, Centre for"), | ||
| ("/section/Cell-and-Systems-Biology", "Cell and Systems Biology"), | ||
| ("/section/Chemistry", "Chemistry"), | ||
| ("/section/Cinema-Studies-Institute", "Cinema Studies (Cinema Studies Institute)"), | ||
| ("/section/Classics", "Classics"), | ||
| ("/section/Computer-Science", "Computer Science"), | ||
| ("/section/Contemporary-Asian-Studies", "Contemporary Asian Studies, Dr. David Chu Program in"), | ||
| ("/section/Criminology-and-Sociolegal-Studies", "Criminology and Sociolegal Studies, Centre for"), | ||
| ("/section/Diaspora-and-Transnational-Studies", "Diaspora and Transnational Studies"), | ||
| ("/section/Drama,-Theatre-and-Performance-Studies", "Drama, Theatre and Performance Studies, Centre for"), | ||
| ("/section/Earth-Sciences", "Earth Sciences"), | ||
| ("/section/East-Asian-Studies", "East Asian Studies"), | ||
| ("/section/Ecology-and-Evolutionary-Biology", "Ecology and Evolutionary Biology"), | ||
| ("/section/Economics", "Economics"), | ||
| ("/section/English", "English"), | ||
| ("/section/Centre-for-Entrepreneurship", "Entrepreneurship, Centre for"), | ||
| ("/section/School-of-the-Environment", "Environment (School of the Environment)"), | ||
| ("/section/Slavic-and-East-European-Languages-and-Cultures", "Estonian"), | ||
| ("/section/Centre-for-Ethics", "Ethics, Centre for"), | ||
| ("/section/European-Affairs", "European Affairs"), | ||
| ("/section/Slavic-and-East-European-Languages-and-Cultures", "Finnish"), | ||
| ("/section/First-Year-Foundations", "First-Year Foundations"), | ||
| ("/section/Forest-Conservation-and-Forest-Biomaterials-Science", "Forest Conservation and Forest Biomaterials Science"), | ||
| ("/section/French", "French"), | ||
| ("/section/Geography-and-Planning", "Geography and Planning"), | ||
| ("/section/German", "German"), | ||
| ("/section/History", "History"), | ||
| ("/section/History-and-Philosophy-of-Science-and-Technology", "History and Philosophy of Science and Technology"), | ||
| ("/section/Human-Biology", "Human Biology"), | ||
| ("/section/Hungarian", "Hungarian"), | ||
| ("/section/Immunology", "Immunology"), | ||
| ("/section/Indigenous-Studies", "Indigenous Studies"), | ||
| ("/section/Industrial-Relations-and-Human-Resources", "Industrial Relations and Human Resources, Centre for"), | ||
| ("/section/Innis-College", "Innis College"), | ||
| ("/section/Italian", "Italian"), | ||
| ("/section/Centre-for-Jewish-Studies", "Jewish Studies, Centre for"), | ||
| ("/section/Laboratory-Medicine-and-Pathobiology", "Laboratory Medicine and Pathobiology"), | ||
| ("/section/Latin-American-Studies", "Latin American Studies"), | ||
| ("/section/Linguistics", "Linguistics"), | ||
| ("/section/Materials-Science", "Materials Science"), | ||
| ("/section/Mathematics", "Mathematics"), | ||
| ("/section/Centre-for-Medieval-Studies", "Medieval Studies, Centre for"), | ||
| ("/section/Molecular-Genetics-and-Microbiology", "Molecular Genetics and Microbiology"), | ||
| ("/section/Munk-School-of-Global-Affairs-and-Public-Policy", "Munk School of Global Affairs and Public Policy"), | ||
| ("/section/Music", "Music"), | ||
| ("/section/Near-and-Middle-Eastern-Civilizations", "Near and Middle Eastern Civilizations"), | ||
| ("/section/New-College", "New College"), | ||
| ("/section/Nutritional-Sciences", "Nutritional Sciences"), | ||
| ("/section/Munk-School-of-Global-Affairs-and-Public-Policy", "Peace, Conflict and Justice"), | ||
| ("/section/Pharmacology-and-Toxicology", "Pharmacology and Toxicology"), | ||
| ("/section/Philosophy", "Philosophy"), | ||
| ("/section/Physics", "Physics"), | ||
| ("/section/Physiology", "Physiology"), | ||
| ("/section/Planetary-Science", "Planetary Science"), | ||
| ("/section/Political-Science", "Political Science"), | ||
| ("/section/Portuguese", "Portuguese"), | ||
| ("/section/Psychology", "Psychology"), | ||
| ("/section/Munk-School-of-Global-Affairs-and-Public-Policy", "Public Policy"), | ||
| ("/section/Religion", "Religion"), | ||
| ("/section/Rotman-Commerce", "Rotman Commerce"), | ||
| ("/section/St.-Michael's-College", "St. Michael's College"), | ||
| ("/section/Sexual-Diversity-Studies", "Sexual Diversity Studies, Mark S. Bonham Centre for"), | ||
| ("/section/Slavic-and-East-European-Languages-and-Cultures", "Slavic and East European Languages and Cultures"), | ||
| ("/section/Sociology", "Sociology"), | ||
| ("/section/South-Asian-Studies", "South Asian Studies"), | ||
| ("/section/Spanish", "Spanish"), | ||
| ("/section/Statistical-Sciences", "Statistical Sciences"), | ||
| ("/section/Canadian-Institute-for-Theoretical-Astrophysics", "Theoretical Astrophysics (Canadian Institute for Theoretical Astrophysics)"), | ||
| ("/section/Trinity-College", "Trinity College"), | ||
| ("/section/University-College", "University College"), | ||
| ("/section/Geography-and-Planning", "Urban Studies"), | ||
| ("/section/Victoria-College", "Victoria College"), | ||
| ("/section/Women-and-Gender-Studies", "Women and Gender Studies"), | ||
| ("/section/Woodsworth-College", "Woodsworth College"), | ||
| ("/writing-faculty-arts-science", "Writing in the Faculty of Arts & Science"), | ||
| ("/section/Yiddish-Studies", "Yiddish Studies") | ||
| ] | ||
|
|
||
| -- | Inputs for the 'parseDepartmentList' tests: each entry is a test label, the URL to parse, and the list the parser is expected to return. | ||
| parseDeptListTestCases :: [(String, String, [(T.Text, T.Text)])] | ||
| parseDeptListTestCases = [programAreasPage] | ||
|   where | ||
|     programAreasPage = | ||
|       ( "Program/subject areas page" | ||
|       , "https://artsci.calendar.utoronto.ca/listing-program-subject-areas" | ||
|       , parsedDepts | ||
|       ) | ||
|
|
||
| -- | Build one HUnit test case from a (label, input URL, expected output) triple: parse the URL with 'parseDepartmentList' and compare the result with the expected list. | ||
| runParseDeptListTest :: (String, String, [(T.Text, T.Text)]) -> TestTree | ||
| runParseDeptListTest (caseName, url, wanted) = | ||
|   testCase caseName (parseDepartmentList url >>= assertEqual failureMsg wanted) | ||
|   where | ||
|     failureMsg = "Unexpected parsing result for " ++ caseName | ||
|
|
||
| -- | Every entry of 'parseDeptListTestCases', converted to a 'TestTree' by 'runParseDeptListTest'. | ||
| runParseDeptListTests :: [TestTree] | ||
| runParseDeptListTests = | ||
|   runParseDeptListTest <$> parseDeptListTestCases | ||
|
|
||
| -- | Top-level test tree for the ArtSciParser module, grouping all 'parseDepartmentList' tests. | ||
| test_artSciParser :: TestTree | ||
| test_artSciParser = testGroup groupName runParseDeptListTests | ||
|   where | ||
|     groupName = "ArtSciParser tests" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
While `parseDepartmentList` is good to be exported for testing purposes, `getDeptList` isn't used anywhere else in the codebase so we can remove it from this export list.