@@ -1655,3 +1655,167 @@ class Squad(Base):
16551655 ann = dataset_dec (base = Squad , url = "http://test.com" )
16561656 dw = DatasetWrapper (ann , Squad )
16571657 assert dw .id == "stanford.qa.squad"
1658+
1659+
1660+ # ==== Redownload with folder-path == datapath ====
1661+
1662+
class FolderAtDatapath(FolderResource):
    """Folder resource stub whose ``path`` is the dataset's own datapath.

    Stands in for an ArchiveDownloader with a single resource: rather than
    pointing at a subdirectory, ``path`` resolves to ``dataset.datapath``.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Set by _download so tests can tell whether a download was performed.
        self._download_called = False

    @property
    def path(self) -> Path:
        """Return the bound dataset's datapath (not a subdirectory of it)."""
        return self.dataset.datapath

    def _download(self, destination: Path):
        """Create ``destination`` and write a small ``data.tsv`` file into it."""
        destination.mkdir(exist_ok=True, parents=True)
        data_file = destination / "data.tsv"
        data_file.write_text("col1\t col2\n ")
        # Record the call only after the file has actually been written.
        self._download_called = True
1682+
1683+
class TestRedownloadFolderAtDatapath:
    """Regression tests for a single FolderResource whose path == datapath.

    When ``resource.path`` is the same directory as ``dataset.datapath``, the
    directory is expected to exist already (it is where ``.state.json`` is
    kept), so detecting "COMPLETE but files missing" needs more than a bare
    ``path.exists()`` check.

    The class also holds tests for a transient source feeding a dependent
    file resource.
    """

    @staticmethod
    def _bind_folder_at_datapath(dataset):
        """Bind one FolderAtDatapath as the dataset's only ordered resource."""
        resource = FolderAtDatapath()
        resource.bind("DATA", dataset)

        dataset.ordered_resources = [resource]
        _compute_dependents(dataset.resources)
        return resource

    @staticmethod
    def _bind_transient_source(dataset):
        """Create a transient DummyFolderResource bound under ``SOURCE``."""
        source = DummyFolderResource(transient=True)
        source.bind("SOURCE", dataset)
        return source

    @staticmethod
    def _register_dependent(dataset, source, dependent):
        """Bind ``dependent`` under ``RESULT`` with ``source`` as its dependency.

        Also resets the dataset's ordered resources to ``[source, dependent]``
        and recomputes the dependents mapping.
        """
        dependent._dependencies = [source]
        dependent.bind("RESULT", dataset)

        dataset.ordered_resources = [source, dependent]
        _compute_dependents(dataset.resources)

    def test_redownload_when_content_missing(self, dataset):
        """A COMPLETE folder resource at datapath is re-downloaded when empty."""
        resource = self._bind_folder_at_datapath(dataset)

        # Set up the inconsistent situation: the state claims COMPLETE while
        # the directory exists without any data file inside it.
        dataset.datapath.mkdir(parents=True, exist_ok=True)
        resource.state = ResourceState.COMPLETE
        assert resource.path.exists()  # the datapath directory itself is there
        assert not (resource.path / "data.tsv").exists()  # content is absent

        dataset.download()

        assert resource._download_called is True
        assert resource.state == ResourceState.COMPLETE
        assert (resource.path / "data.tsv").exists()

    def test_no_redownload_when_content_present(self, dataset):
        """A COMPLETE folder resource at datapath with content is left alone."""
        resource = self._bind_folder_at_datapath(dataset)

        # State is COMPLETE and the data file really is on disk.
        dataset.datapath.mkdir(parents=True, exist_ok=True)
        existing_file = dataset.datapath / "data.tsv"
        existing_file.write_text("existing\n ")
        resource.state = ResourceState.COMPLETE

        dataset.download()

        assert resource._download_called is False
        assert resource.state == ResourceState.COMPLETE

    def _make_transient_dag(self, dataset):
        """Build a two-node DAG: transient source -> dependent file resource.

        Returns the ``(source, dependent)`` pair.
        """
        source = self._bind_transient_source(dataset)
        dependent = DummyFileResource("result.txt")
        self._register_dependent(dataset, source, dependent)
        return source, dependent

    def test_transient_first_download(self, dataset):
        """First run: the transient source is downloaded, then cleaned up."""
        source, dependent = self._make_transient_dag(dataset)

        dataset.download()

        # Both resources went through their download step.
        assert source._download_called is True
        assert dependent._download_called is True
        assert dependent.path.exists()
        assert dependent.state == ResourceState.COMPLETE

        # The transient source is gone once its dependent has completed.
        assert source.state == ResourceState.NONE
        assert not source.path.exists()

    def test_transient_second_download_all_ok(self, dataset):
        """Second run: nothing is downloaded again once the dependent is COMPLETE."""
        source, dependent = self._make_transient_dag(dataset)

        # Initial run downloads both resources.
        dataset.download()
        assert source._download_called is True
        assert dependent._download_called is True

        # Clear the flags so the second run can be observed in isolation.
        source._download_called = False
        dependent._download_called = False

        # Second run: all the work has already been done.
        dataset.download()

        # The transient source is skipped because its dependent is COMPLETE.
        assert source._download_called is False
        # The dependent keeps its COMPLETE state without a new download.
        assert dependent._download_called is False
        assert dependent.state == ResourceState.COMPLETE

    def test_transient_second_download_after_first_failure(self, dataset):
        """Retry after a failed dependent reuses the kept transient source.

        The source stays COMPLETE after the failed run, is not downloaded
        again on the retry, and is cleaned up once the dependent completes.
        """
        source = self._bind_transient_source(dataset)

        # FailingResource is expected to fail its download on the first run.
        failing_dependent = FailingResource("result.txt")
        self._register_dependent(dataset, source, failing_dependent)

        # First run: the source succeeds while the dependent fails.
        first_result = dataset.download()
        assert first_result is False
        assert source._download_called is True
        # The source stays COMPLETE (no cleanup: its dependent is not COMPLETE).
        assert source.state == ResourceState.COMPLETE
        assert failing_dependent.state == ResourceState.NONE

        # Swap the failing resource for a working one before retrying:
        # drop the old RESULT binding, then register the replacement.
        source._download_called = False
        del dataset.resources["RESULT"]
        good_dependent = DummyFileResource("result.txt")
        self._register_dependent(dataset, source, good_dependent)

        dataset.download()

        # The source was already COMPLETE, so it is not downloaded again.
        assert source._download_called is False
        # The replacement dependent goes through this time.
        assert good_dependent._download_called is True
        assert good_dependent.state == ResourceState.COMPLETE
        # With the dependent COMPLETE, the transient source is cleaned up.
        assert source.state == ResourceState.NONE
0 commit comments