diff --git a/.github/requirements-earliest.txt b/.github/requirements-earliest.txt index cc72e71..4d0d2ee 100644 --- a/.github/requirements-earliest.txt +++ b/.github/requirements-earliest.txt @@ -1 +1,4 @@ mercurial==5.2 + +# Required for git_lfs_importer plugin +pathspec==0.11.2 \ No newline at end of file diff --git a/.github/requirements-latest.txt b/.github/requirements-latest.txt index 3620838..4ea702e 100644 --- a/.github/requirements-latest.txt +++ b/.github/requirements-latest.txt @@ -1,2 +1,4 @@ mercurial +# Required for git_lfs_importer plugin +pathspec==0.12.1 \ No newline at end of file diff --git a/README.md b/README.md index 9bc113e..8fbf64e 100644 --- a/README.md +++ b/README.md @@ -141,12 +141,48 @@ if [ "$3" == "1" ]; then cat; else dos2unix -q; fi Mercurial Largefiles Extension ------------------------------ -Mercurial largefiles are exported as ordinary files into git, i.e. not -as git lfs files. In order to make the export work, make sure that -you have all largefiles of all mercurial commits available locally. -This can be ensured by either cloning the mercurial repository with -the option --all-largefiles or by executing the command -'hg lfpull --rev "all()"' inside the mercurial repository. +### Handling Mercurial Largefiles during Migration + +When migrating from Mercurial to Git, largefiles are exported as ordinary +files by default. To ensure a successful migration and manage repository +size, follow the requirements below. + +#### 1. Pre-Export: Ensure File Availability + +Before starting the export, you must have all largefiles from all +Mercurial commits available locally. Use one of these methods: + +* **For a new clone:** `hg clone --all-largefiles ` +* **For an existing repo:** `hg lfpull --rev "all()"` + +#### 2. 
Choosing Your LFS Strategy + +If you want your files to be versioned in Git LFS rather than as standard +Git blobs, you have two primary paths: + +* **[git_lfs_importer plugin](./plugins/git_lfs_importer/README.md) + (During Conversion)** + Recommended for large repos. This performs Just-In-Time (JIT) conversion + by identifying large files during the export and writing LFS pointers + immediately, skipping the need for a second pass. This also supports + **incremental conversion**, making it much more efficient for ongoing + migrations. +* **[git lfs migrate import](https://github.com/git-lfs/git-lfs/blob/main/docs/man/git-lfs-migrate.adoc) + (After Conversion)** + A standard two-step process: first, export the full history from Mercurial + to Git, then run a separate full history rewrite to move files into LFS. + +### Why use the git_lfs_importer plugin? + +For "monorepos" or very large repositories (100GiB+), the traditional +two-step process can take days. By integrating the LFS conversion +directly into the history export, the plugin eliminates the massive +time overhead of a secondary history rewrite and allows for incremental +progress. + +For detailed setup, see the +[git_lfs_importer](./plugins/git_lfs_importer/README.md) +plugin documentation. Plugins ----------------- @@ -177,9 +213,18 @@ defined filter methods in the [dos2unix](./plugins/dos2unix) and [branch_name_in_commit](./plugins/branch_name_in_commit) plugins. 
``` -commit_data = {'branch': branch, 'parents': parents, 'author': author, 'desc': desc, 'revision': revision, 'hg_hash': hg_hash, 'committer': 'committer', 'extra': extra} +commit_data = { + 'author': author, + 'branch': branch, + 'committer': 'committer', + 'desc': desc, + 'extra': extra, + 'hg_hash': hg_hash, + 'parents': parents, + 'revision': revision, +} -def commit_message_filter(self,commit_data): +def commit_message_filter(self, commit_data): ``` The `commit_message_filter` method is called for each commit, after parsing from hg, but before outputting to git. The dictionary `commit_data` contains the @@ -188,9 +233,14 @@ values in the dictionary after filters have been run are used to create the git commit. ``` -file_data = {'filename':filename,'file_ctx':file_ctx,'data':file_contents, 'is_largefile':largefile_status} +file_data = { + 'data': file_contents, + 'file_ctx': file_ctx, + 'filename': filename, + 'is_largefile': largefile_status, +} -def file_data_filter(self,file_data): +def file_data_filter(self, file_data): ``` The `file_data_filter` method is called for each file within each commit. 
The dictionary `file_data` contains the above attributes about the file, and diff --git a/hg-fast-export.py b/hg-fast-export.py index 9405468..c5f6f5d 100755 --- a/hg-fast-export.py +++ b/hg-fast-export.py @@ -284,7 +284,7 @@ def strip_leading_slash(filename): def export_commit(ui,repo,revision,old_marks,max,count,authors, branchesmap,sob,brmap,hgtags,encoding='',fn_encoding='', - plugins={}): + first_commit_hash="",plugins={}): def get_branchname(name): if name in brmap: return brmap[name] @@ -332,6 +332,9 @@ def export_commit(ui,repo,revision,old_marks,max,count,authors, if not parents: type='full' + if revision == 0 and first_commit_hash: + wr(b'from %s' % first_commit_hash.encode()) + type='simple delta' else: wr(b'from %s' % revnum_to_revref(parents[0], old_marks)) if len(parents) == 1: @@ -526,7 +529,8 @@ def verify_heads(ui,repo,cache,force,ignore_unnamed_heads,branchesmap): def hg2git(repourl,m,marksfile,mappingfile,headsfile,tipfile, authors={},branchesmap={},tagsmap={}, - sob=False,force=False,ignore_unnamed_heads=False,hgtags=False,notes=False,encoding='',fn_encoding='', + sob=False,force=False,ignore_unnamed_heads=False,hgtags=False, + notes=False,encoding='',fn_encoding='',first_commit_hash='', plugins={}): def check_cache(filename, contents): if len(contents) == 0: @@ -582,7 +586,7 @@ def hg2git(repourl,m,marksfile,mappingfile,headsfile,tipfile, brmap={} for rev in range(min,max): c=export_commit(ui,repo,rev,old_marks,max,c,authors,branchesmap, - sob,brmap,hgtags,encoding,fn_encoding, + sob,brmap,hgtags,encoding,fn_encoding,first_commit_hash, plugins) if notes: for rev in range(min,max): @@ -656,6 +660,8 @@ if __name__=='__main__': help="Add a plugin with the given init string ") parser.add_option("--subrepo-map", type="string", dest="subrepo_map", help="Provide a mapping file between the subrepository name and the submodule name") + parser.add_option("--first-commit-hash", type="string", dest="first_commit_hash", + help="Allow importing into an 
existing git repository by specifying the hash of the first commit") (options,args)=parser.parse_args() @@ -735,4 +741,5 @@ if __name__=='__main__': ignore_unnamed_heads=options.ignore_unnamed_heads, hgtags=options.hgtags, notes=options.notes,encoding=encoding,fn_encoding=fn_encoding, + first_commit_hash=options.first_commit_hash, plugins=plugins_dict)) diff --git a/hg-fast-export.sh b/hg-fast-export.sh index 7d6d7b7..30e8ee1 100755 --- a/hg-fast-export.sh +++ b/hg-fast-export.sh @@ -87,6 +87,8 @@ Options: with as arguments --plugin Add a plugin with the given init string (repeatable) --plugin-path Add an additional plugin lookup path + --first-commit-hash Use the given git commit hash as the + first commit's parent (for grafting) " case "$1" in -h|--help) diff --git a/plugins/git_lfs_importer/README.md b/plugins/git_lfs_importer/README.md new file mode 100644 index 0000000..e635748 --- /dev/null +++ b/plugins/git_lfs_importer/README.md @@ -0,0 +1,218 @@ +# git_lfs_importer Plugin + +This plugin automatically converts matching files to use Git LFS +(Large File Storage) during the Mercurial to Git conversion process. + +## Overview + +The git_lfs_importer plugin intercepts file data during the hg-fast-export +process and converts files matching specified patterns into Git LFS pointers. +This allows you to seamlessly migrate a Mercurial repository to Git while +simultaneously adopting LFS for large files. + +Why use git_lfs_importer? +For large repositories, traditional migration requires two sequential, +long-running steps: + + 1. Full history conversion from Mercurial to Git. + 2. Full history rewrite using git lfs import. + +This two-step process can take hours or even days for massive +monorepos (e.g., 100GiB+). + +This plugin eliminates the second, time-consuming history rewrite. It performs +the LFS conversion incrementally (Just-In-Time). During the initial export, the +plugin identifies large files and immediately writes LFS pointers into the Git +history. 
This results in significantly faster conversions and allows for +efficient incremental imports of new changesets. + +## Prerequisites + +### Dependencies + +This plugin requires the `pathspec` package: + +```bash +pip install pathspec +``` + +### Git Repository Setup + +The destination Git repository must be pre-initialized with: + +1. A `.gitattributes` file configured for LFS tracking +2. Git LFS properly installed and initialized + +Example `.gitattributes`: +``` +*.bin filter=lfs diff=lfs merge=lfs -text +*.iso filter=lfs diff=lfs merge=lfs -text +large_files/** filter=lfs diff=lfs merge=lfs -text +``` + +## Usage + +### Step 1: Create the Destination Git Repository + +```bash +# Create a new git repository +git init my-repo +cd my-repo + +# Initialize Git LFS +git lfs install + +# Create and commit a .gitattributes file +cat > .gitattributes << EOF +*.bin filter=lfs diff=lfs merge=lfs -text +*.iso filter=lfs diff=lfs merge=lfs -text +EOF +git add .gitattributes +git commit -m "Initialize Git LFS configuration" + +# Get the commit hash (needed for --first-commit-hash) +git rev-parse HEAD +``` + +### Step 2: Create an LFS Specification File + +Create a file (e.g., `lfs-spec.txt`) listing the patterns of files to convert +to LFS. This uses gitignore-style glob patterns: + +``` +*.bin +*.iso +*.tar.gz +large_files/** +*.mp4 +``` + +### Step 3: Run hg-fast-export with the Plugin + +```bash +hg-fast-export.sh \ + -r <path-to-hg-repo> \ + --plugin git_lfs_importer=lfs-spec.txt \ + --first-commit-hash <first-commit-hash> \ + --force +``` + +Replace `<first-commit-hash>` with the hash obtained from Step 1. + +## How It Works + +1. **Pattern Matching**: Files are matched against patterns in the + LFS specification file using gitignore-style matching +2. 
**File Processing**: For each matching file: + - Calculates SHA256 hash of the file content + - Stores the actual file content in `.git/lfs/objects//` + - Replaces the file data with an LFS pointer containing: + - LFS version specification + - SHA256 hash of the original content + - Original file size +3. **Git Fast-Import**: The LFS pointer is committed instead of the actual + file content + +## Important Notes + +### First Commit Hash Requirement + +The `--first-commit-hash` option must be provided with the Git commit hash that +contains your `.gitattributes` file. This allows the plugin to chain from the +existing Git history rather than creating a completely new history. + +### Deletions + +The plugin safely handles file deletions (data=None) and does not process them. + +### Large Files and Largefiles + +If the Mercurial repository uses Mercurial's largefiles extension, those files +are already converted to their original content before reaching this plugin, +allowing the plugin to apply LFS conversion if they match the patterns. + +## Example Workflow + +```bash +# Configuration variables +HG_REPO=/path/to/mercurial/repo +GIT_DIR_NAME=my-project-git +LFS_PATTERN_FILE=../lfs-patterns.txt + +# 1. Prepare destination git repo +mkdir "$GIT_DIR_NAME" +cd "$GIT_DIR_NAME" +git init +git lfs install + +# Create .gitattributes +cat > .gitattributes << EOF +*.bin filter=lfs diff=lfs merge=lfs -text +*.iso filter=lfs diff=lfs merge=lfs -text +EOF + +git add .gitattributes +git commit -m "Add LFS configuration" +FIRST_HASH=$(git rev-parse HEAD) + +# 2. Create LFS patterns file +cat > "$LFS_PATTERN_FILE" << EOF +*.bin +*.iso +build/artifacts/** +EOF + +# 3. Run conversion +/path/to/hg-fast-export.sh \ + -r "$HG_REPO" \ + --plugin "git_lfs_importer=$LFS_PATTERN_FILE" \ + --first-commit-hash $FIRST_HASH \ + --force + +# 4. 
Verify +git log --oneline +git lfs ls-files +``` + +## Troubleshooting + +### LFS Files Not Tracked +Verify that: +- The `.gitattributes` file exists in the destination repository +- Patterns in `.gitattributes` match the files being converted +- `git lfs install` was run in the repository + +### "pathspec" Module Not Found +Install the required dependency: +```bash +pip install pathspec +``` + +### Conversion Fails at Import +Ensure the `--first-commit-hash` value is: +- A valid commit hash in the destination repository +- From a commit that exists before the conversion starts +- The hash of the commit containing `.gitattributes` + + +### Force Requirement + +You only need to pass the `--force` option when converting the *first* +Mercurial commit into a non-empty Git repository. By default, `hg-fast-export` +prevents importing Mercurial commits onto a non-empty Git repo to avoid +creating conflicting histories. Passing `--force` overrides that safety check +and allows the exporter to write the LFS pointer objects and integrate the +converted data with the existing Git history. + +If you are doing an incremental conversion (i.e., running the script a second +time to import new changesets into an already converted repository), +the --force flag is not required. + +Omitting `--force` when attempting to import the first Mercurial commit into a +non-empty repository will cause the importer to refuse the operation. 
+ +## See Also + +- [Git LFS Documentation](https://git-lfs.github.com/) +- [gitignore Pattern Format](https://git-scm.com/docs/gitignore) +- [hg-fast-export Documentation](../README.md) diff --git a/plugins/git_lfs_importer/__init__.py b/plugins/git_lfs_importer/__init__.py new file mode 100644 index 0000000..7d9efd9 --- /dev/null +++ b/plugins/git_lfs_importer/__init__.py @@ -0,0 +1,49 @@ +import pathlib +import hashlib +import pathspec + + +def build_filter(args): + with open(args) as f: + lfs_spec = pathspec.PathSpec.from_lines(pathspec.patterns.GitWildMatchPattern, f) + return Filter(lfs_spec) + + +class Filter: + def __init__(self, lfs_spec): + self.lfs_spec = lfs_spec + + def file_data_filter(self, file_data): + """ + file_data: { + 'filename': , + 'file_ctx': , + 'data': , + 'is_largefile': + } + + May be called for deletions (data=None, file_ctx=None). + """ + filename = file_data.get('filename') + data = file_data.get('data') + + # Skip deletions or filtered files early + if data is None or not self.lfs_spec.match_file(filename.decode("utf-8")): + return + + # Get the file path + sha256hash = hashlib.sha256(data).hexdigest() + lfs_path = pathlib.Path(f".git/lfs/objects/{sha256hash[0:2]}/{sha256hash[2:4]}") + lfs_path.mkdir(parents=True, exist_ok=True) + lfs_file_path = lfs_path / sha256hash + + # The binary blob is already in LFS + if not lfs_file_path.is_file(): + (lfs_path / sha256hash).write_bytes(data) + + # Write the LFS pointer + file_data['data'] = ( + f"version https://git-lfs.github.com/spec/v1\n" + f"oid sha256:{sha256hash}\n" + f"size {len(data)}\n" + ).encode("utf-8") diff --git a/t/first_commit_hash_option.t b/t/first_commit_hash_option.t new file mode 100755 index 0000000..8df44d0 --- /dev/null +++ b/t/first_commit_hash_option.t @@ -0,0 +1,117 @@ +#!/bin/bash +# +# Copyright (c) 2025 +# + +test_description='git_lfs_importer plugin integration tests' + +. 
"${SHARNESS_TEST_SRCDIR-$(dirname "$0")/sharness}"/sharness.sh || exit 1 + +setup() { + cat > "$HOME"/.hgrc <<-EOF + [ui] + username = Test User + EOF + + # Git config for the destination repo commits + git config --global user.email "test@example.com" + git config --global user.name "Test User" +} + +setup + +test_expect_success 'Mercurial history is imported over the provided commit' ' + test_when_finished "rm -rf hgrepo gitrepo lfs-patterns.txt" && + + # 1. Create source Mercurial repository with binary files + ( + hg init hgrepo && + cd hgrepo && + echo "regular text file" > readme.txt && + hg add readme.txt && + hg commit -m "initial commit" + ) && + + # 2. Prepare destination git repo with LFS setup + mkdir gitrepo && + ( + cd gitrepo && + git init -q && + git config core.ignoreCase false && + git lfs install --local && + git switch --create master && + + cat > .gitattributes <<-EOF && + * -text + EOF + + git add .gitattributes && + git commit -q -m "Initialize Git configuration" + ) && + + FIRST_HASH=$(git -C gitrepo rev-parse HEAD) && + + # 3. Run hg-fast-export + ( + cd gitrepo && + hg-fast-export.sh \ + -r "../hgrepo" \ + --first-commit-hash "$FIRST_HASH" --force \ + -M master + ) && + + # 4. Verify git file is still present + git -C gitrepo show HEAD:.gitattributes > gitattributes_check.txt && + test "$(cat gitattributes_check.txt)" = "* -text" && + + # 5. Verify hg file is imported + git -C gitrepo show HEAD:readme.txt > readme_check.txt && + test "$(cat readme_check.txt)" = "regular text file" +' + +test_expect_success 'Mercurial history has priority over git' ' + test_when_finished "rm -rf hgrepo gitrepo lfs-patterns.txt" && + + # 1. Create source Mercurial repository with binary files + ( + hg init hgrepo && + cd hgrepo && + echo "hg readme file" > readme.txt && + hg add readme.txt && + hg commit -m "initial commit" + ) && + + # 2. 
Prepare destination git repo with LFS setup + mkdir gitrepo && + ( + cd gitrepo && + git init -q && + git config core.ignoreCase false && + git lfs install --local && + git switch --create master && + + cat > readme.txt <<-EOF && + git readme file + EOF + + git add readme.txt && + git commit -q -m "Initialize Git readme file" + ) && + + FIRST_HASH=$(git -C gitrepo rev-parse HEAD) && + + # 3. Run hg-fast-export + ( + cd gitrepo && + hg-fast-export.sh \ + -r "../hgrepo" \ + --first-commit-hash "$FIRST_HASH" --force \ + -M master + ) && + + # 5. Verify hg file is imported + git -C gitrepo show HEAD:readme.txt > readme_check.txt && + test "$(cat readme_check.txt)" = "hg readme file" +' + +test_done diff --git a/t/git_lfs_importer_plugin.t b/t/git_lfs_importer_plugin.t new file mode 100755 index 0000000..b0ff5d9 --- /dev/null +++ b/t/git_lfs_importer_plugin.t @@ -0,0 +1,189 @@ +#!/bin/bash +# +# Copyright (c) 2025 +# + +test_description='git_lfs_importer plugin integration tests' + +. "${SHARNESS_TEST_SRCDIR-$(dirname "$0")/sharness}"/sharness.sh || exit 1 + +setup() { + cat > "$HOME"/.hgrc <<-EOF + [ui] + username = Test User + EOF + + # Git config for the destination repo commits + git config --global user.email "test@example.com" + git config --global user.name "Test User" +} + +setup + +test_expect_success 'git_lfs_importer converts matched binary files to LFS pointers and pointers are properly smudged when checkouting' ' + test_when_finished "rm -rf hgrepo gitrepo lfs-patterns.txt" && + + # 1. Create source Mercurial repository with binary files + ( + hg init hgrepo && + cd hgrepo && + echo "regular text file" > readme.txt && + echo "binary payload" > payload.bin && + hg add readme.txt payload.bin && + hg commit -m "initial commit with binary" + ) && + + # 2. 
Prepare destination git repo with LFS setup + mkdir gitrepo && + ( + cd gitrepo && + git init -q && + git config core.ignoreCase false && + git lfs install --local && + + cat > .gitattributes <<-EOF && + *.bin filter=lfs diff=lfs merge=lfs -text + EOF + + git add .gitattributes && + git commit -q -m "Initialize Git LFS configuration" + ) && + + FIRST_HASH=$(git -C gitrepo rev-parse HEAD) && + + # 3. Create LFS patterns file + cat > lfs-patterns.txt <<-EOF && + *.bin + EOF + + # 4. Run hg-fast-export with git_lfs_importer plugin + ( + cd gitrepo && + hg-fast-export.sh \ + -r "../hgrepo" \ + --plugin "git_lfs_importer=../lfs-patterns.txt" \ + --first-commit-hash "$FIRST_HASH" --force + ) && + + # 5. Verify conversion: payload.bin should be an LFS pointer + git -C gitrepo show HEAD:payload.bin > lfs_pointer.txt && + grep -q "version https://git-lfs.github.com/spec/v1" lfs_pointer.txt && + grep -q "oid sha256:" lfs_pointer.txt && + grep -q "size" lfs_pointer.txt && + + # 6. Verify non-matched file is unchanged + git -C gitrepo show HEAD:readme.txt > readme_check.txt && + test "$(cat readme_check.txt)" = "regular text file" && + + # 7. Make sure the LFS pointer file is unsmeared when checked out + git -C gitrepo reset --hard HEAD && + ls gitrepo && + test "$(cat gitrepo/payload.bin)" = "binary payload" +' + +test_expect_success 'git_lfs_importer skips files not matching patterns' ' + test_when_finished "rm -rf hgrepo gitrepo lfs-patterns.txt" && + + # 1. Create source with various files + ( + hg init hgrepo && + cd hgrepo && + echo "text" > file.txt && + echo "data" > file.dat && + echo "iso content" > image.iso && + hg add . && + hg commit -m "multiple files" + ) && + + # 2. 
Prepare git repo with LFS + mkdir gitrepo && + ( + cd gitrepo && + git init -q && + git config core.ignoreCase false && + git lfs install --local && + + cat > .gitattributes <<-EOF && + *.iso filter=lfs diff=lfs merge=lfs -text + EOF + + git add .gitattributes && + git commit -q -m "Initialize Git LFS configuration" + ) && + + FIRST_HASH=$(git -C gitrepo rev-parse HEAD) && + + # 3. Only .iso files should be converted + cat > lfs-patterns.txt <<-EOF && + *.iso + EOF + + ( + cd gitrepo && + hg-fast-export.sh \ + -r "../hgrepo" \ + --plugin "git_lfs_importer=../lfs-patterns.txt" \ + --first-commit-hash "$FIRST_HASH" --force + ) && + + # 4. Verify .iso is LFS pointer + git -C gitrepo show HEAD:image.iso | grep -q "oid sha256:" && + + # 5. Verify .txt and .dat are unchanged + test "$(git -C gitrepo show HEAD:file.txt)" = "text" && + test "$(git -C gitrepo show HEAD:file.dat)" = "data" +' + +test_expect_success 'git_lfs_importer handles directory patterns' ' + test_when_finished "rm -rf hgrepo gitrepo lfs-patterns.txt" && + + # 1. Create repo with files in directory + ( + hg init hgrepo && + cd hgrepo && + mkdir -p assets/images && + echo "logo data" > assets/images/logo.bin && + echo "regular" > readme.txt && + hg add . && + hg commit -m "files in directories" + ) && + + # 2. Prepare git repo + mkdir gitrepo && + ( + cd gitrepo && + git init -q && + git config core.ignoreCase false && + git lfs install --local && + + cat > .gitattributes <<-EOF && + assets/** filter=lfs diff=lfs merge=lfs -text + EOF + + git add .gitattributes && + git commit -q -m "Initialize Git LFS configuration" + ) && + + FIRST_HASH=$(git -C gitrepo rev-parse HEAD) && + + # 3. Match directory pattern + cat > lfs-patterns.txt <<-EOF && + assets/** + EOF + + ( + cd gitrepo && + hg-fast-export.sh \ + -r "../hgrepo" \ + --plugin "git_lfs_importer=../lfs-patterns.txt" \ + --first-commit-hash "$FIRST_HASH" --force + ) && + + # 4. 
Verify directory file is converted + git -C gitrepo show HEAD:assets/images/logo.bin | grep -q "oid sha256:" && + + # 5. Verify file outside directory is unchanged + test "$(git -C gitrepo show HEAD:readme.txt)" = "regular" +' + +test_done diff --git a/tests/test_git_lfs_importer_plugin.py b/tests/test_git_lfs_importer_plugin.py new file mode 100644 index 0000000..92ac04c --- /dev/null +++ b/tests/test_git_lfs_importer_plugin.py @@ -0,0 +1,156 @@ +import sys + +sys.path.append("./plugins") + +import hashlib +import pathlib +import time +import unittest +import tempfile +import os +import pathspec + +from git_lfs_importer import Filter, build_filter + + +class TestGitLfsImporterPlugin(unittest.TestCase): + def setUp(self): + # create an isolated temp dir and chdir into it for each test + self._orig_cwd = os.getcwd() + self._tmpdir = tempfile.TemporaryDirectory() + self.tmp_path = pathlib.Path(self._tmpdir.name) + os.chdir(self.tmp_path) + + def tearDown(self): + # restore cwd and cleanup + os.chdir(self._orig_cwd) + self._tmpdir.cleanup() + + def empty_spec(self): + return pathspec.PathSpec.from_lines(pathspec.patterns.GitWildMatchPattern, []) + + # -------------------------------------------------------- + # GIVEN-WHEN-THEN TESTS for Filter.file_data_filter + # -------------------------------------------------------- + + def test_skips_deletions(self): + flt = Filter(self.empty_spec()) + file_data = {"filename": b"file.txt", "data": None} + + flt.file_data_filter(file_data) + + self.assertIsNone(file_data["data"]) + self.assertFalse((self.tmp_path / ".git").exists()) + + def test_skips_files_that_do_not_match_spec(self): + spec = pathspec.PathSpec.from_lines(pathspec.patterns.GitWildMatchPattern, ["*.bin"]) + flt = Filter(spec) + original = b"not matched" + file_data = {"filename": b"file.txt", "data": original} + + flt.file_data_filter(file_data) + + self.assertEqual(file_data["data"], original) + self.assertFalse((self.tmp_path / ".git").exists()) + + def 
test_converts_only_matched_files_to_lfs_pointer(self): + spec = pathspec.PathSpec.from_lines(pathspec.patterns.GitWildMatchPattern, ["*.bin"]) + flt = Filter(spec) + data = b"hello world" + sha = hashlib.sha256(data).hexdigest() + expected_pointer = ( + f"version https://git-lfs.github.com/spec/v1\n" + f"oid sha256:{sha}\n" + f"size {len(data)}\n" + ).encode("utf-8") + file_data = {"filename": b"payload.bin", "data": data} + + flt.file_data_filter(file_data) + + self.assertEqual(file_data["data"], expected_pointer) + lfs_file = pathlib.Path(".git/lfs/objects") / sha[:2] / sha[2:4] / sha + self.assertTrue(lfs_file.is_file()) + self.assertEqual(lfs_file.read_bytes(), data) + + def test_does_not_convert_unmatched_directory(self): + spec = pathspec.PathSpec.from_lines(pathspec.patterns.GitWildMatchPattern, ["assets/**"]) + flt = Filter(spec) + data = b"outside directory" + file_data = {"filename": b"src/images/logo.png", "data": data} + + flt.file_data_filter(file_data) + + self.assertEqual(file_data["data"], data) + self.assertFalse((self.tmp_path / ".git").exists()) + + def test_converts_matched_directory(self): + spec = pathspec.PathSpec.from_lines(pathspec.patterns.GitWildMatchPattern, ["assets/**"]) + flt = Filter(spec) + data = b"inside directory" + sha = hashlib.sha256(data).hexdigest() + file_data = {"filename": b"assets/images/logo.png", "data": data} + + flt.file_data_filter(file_data) + + self.assertIn(b"version https://git-lfs.github.com/spec/v1", file_data["data"]) + lfs_file = pathlib.Path(".git/lfs/objects") / sha[:2] / sha[2:4] / sha + self.assertTrue(lfs_file.is_file()) + self.assertEqual(lfs_file.read_bytes(), data) + + def test_does_not_overwrite_existing_blob(self): + spec = pathspec.PathSpec.from_lines(pathspec.patterns.GitWildMatchPattern, ["*.bin"]) + flt = Filter(spec) + data = b"abc" + sha = hashlib.sha256(data).hexdigest() + lfs_dir = pathlib.Path(".git/lfs/objects") / sha[:2] / sha[2:4] + lfs_dir.mkdir(parents=True, exist_ok=True) + lfs_file 
= lfs_dir / sha + lfs_file.write_bytes(data) + before_mtime = lfs_file.stat().st_mtime_ns + time.sleep(0.01) # Ensure timestamp difference + + file_data = {"filename": b"abc.bin", "data": data} + + flt.file_data_filter(file_data) + + expected_pointer_prefix = b"version https://git-lfs.github.com/spec/v1" + self.assertTrue(file_data["data"].startswith(expected_pointer_prefix)) + after_mtime = lfs_file.stat().st_mtime_ns + self.assertEqual(after_mtime, before_mtime) + + def test_empty_file_converted_when_matched(self): + spec = pathspec.PathSpec.from_lines(pathspec.patterns.GitWildMatchPattern, ["*.bin"]) + flt = Filter(spec) + data = b"" + sha = hashlib.sha256(data).hexdigest() + file_data = {"filename": b"empty.bin", "data": data} + + flt.file_data_filter(file_data) + + self.assertIn(b"size 0", file_data["data"]) + lfs_file = pathlib.Path(".git/lfs/objects") / sha[:2] / sha[2:4] / sha + self.assertTrue(lfs_file.is_file()) + self.assertEqual(lfs_file.read_bytes(), data) + + # -------------------------------------------------------- + # Optional: GIVEN-WHEN-THEN for build_filter + # -------------------------------------------------------- + + def test_build_filter_reads_patterns_file(self): + patterns_file = self.tmp_path / "lfs_patterns.txt" + patterns_file.write_text("*.bin\nassets/**\n", encoding="utf-8") + + flt = build_filter(str(patterns_file)) + + data_match = b"match me" + sha_match = hashlib.sha256(data_match).hexdigest() + fd_match = {"filename": b"assets/payload.bin", "data": data_match} + flt.file_data_filter(fd_match) + self.assertIn(b"oid sha256:", fd_match["data"]) + lfs_file = pathlib.Path(".git/lfs/objects") / sha_match[:2] / sha_match[2:4] / sha_match + self.assertTrue(lfs_file.is_file()) + + data_skip = b"skip me" + fd_skip = {"filename": b"docs/readme.md", "data": data_skip} + flt.file_data_filter(fd_skip) + self.assertEqual(fd_skip["data"], data_skip)