From 98320356146739154693d37eb906880d9f2ebf5a Mon Sep 17 00:00:00 2001 From: Rocco Rutte Date: Tue, 6 Mar 2007 17:00:25 +0000 Subject: [PATCH 01/38] Initial import This is the initial import of 'hg2git' being a converter which feeds a hg repository into git-fast-import(1). --- hg2git.py | 230 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ hg2git.sh | 76 ++++++++++++++++++ 2 files changed, 306 insertions(+) create mode 100644 hg2git.py create mode 100755 hg2git.sh diff --git a/hg2git.py b/hg2git.py new file mode 100644 index 0000000..0bbbe03 --- /dev/null +++ b/hg2git.py @@ -0,0 +1,230 @@ +#!/usr/bin/env python + +# Copyright (c) 2007 Rocco Rutte +# License: GPLv2 + +"""hg2git.py - A mercurial-to-git filter for git-fast-import(1) +Usage: hg2git.py +""" + +from mercurial import repo,hg,cmdutil,util,ui,revlog +from tempfile import mkstemp +import re +import sys +import os + +# silly regex to see if user field has email address +user_re=re.compile('[^<]+ <[^>]+>$') +# git branch for hg's default 'HEAD' branch +cfg_master='master' +# insert 'checkpoint' command after this many commits +cfg_checkpoint_count=1000 + +def usage(ret): + sys.stderr.write(__doc__) + return ret + +def setup_repo(url): + myui=ui.ui() + return myui,hg.repository(myui,url) + +def get_changeset(ui,repo,revision): + def get_branch(name): + if name=='HEAD': + name=cfg_master + return name + def fixup_user(user): + if user_re.match(user)==None: + if '@' not in user: + return user+' ' + return user+' <'+user+'>' + return user + node=repo.lookup(revision) + (manifest,user,(time,timezone),files,desc,extra)=repo.changelog.read(node) + tz="%+03d%02d" % (-timezone / 3600, ((-timezone % 3600) / 60)) + branch=get_branch(extra.get('branch','master')) + return (manifest,fixup_user(user),(time,tz),files,desc,branch,extra) + +def gitmode(x): + return x and '100755' or '100644' + +def wr(msg=''): + print msg + #map(lambda x: sys.stderr.write('\t[%s]\n' % x),msg.split('\n')) + +def checkpoint(count): + 
count=count+1 + if count%cfg_checkpoint_count==0: + sys.stderr.write("Checkpoint after %d commits\n" % count) + wr('checkpoint') + wr() + return count + +def get_parent_mark(parent,marks): + p=marks.get(str(parent),None) + if p==None: + # if we didn't see parent previously, assume we saw it in this run + p=':%d' % (parent+1) + return p + +def export_commit(ui,repo,revision,marks,heads,last,max,count): + sys.stderr.write('Exporting revision %d (tip %d) as [:%d]\n' % (revision,max,revision+1)) + + (_,user,(time,timezone),files,desc,branch,_)=get_changeset(ui,repo,revision) + parents=repo.changelog.parentrevs(revision) + + # we need this later to write out tags + marks[str(revision)]=':%d'%(revision+1) + + wr('commit refs/heads/%s' % branch) + wr('mark :%d' % (revision+1)) + wr('committer %s %d %s' % (user,time,timezone)) + wr('data %d' % (len(desc)+1)) # wtf? + wr(desc) + wr() + + src=heads.get(branch,'') + link='' + if src!='': + # if we have a cached head, this is an incremental import: initialize it + # and kill reference so we won't init it again + wr('from %s' % src) + heads[branch]='' + elif not heads.has_key(branch) and revision>0: + # newly created branch and not the first one: connect to parent + tmp=get_parent_mark(parents[0],marks) + wr('from %s' % tmp) + sys.stderr.write('Link new branch [%s] to parent [%s]\n' % + (branch,tmp)) + link=tmp # avoid making a merge commit for branch fork + + if parents: + l=last.get(branch,revision) + for p in parents: + # 1) as this commit implicitely is the child of the most recent + # commit of this branch, ignore this parent + # 2) ignore nonexistent parents + # 3) merge otherwise + if p==l or p==revision or p<0: + continue + tmp=get_parent_mark(p,marks) + # if we fork off a branch, don't merge via 'merge' as we have + # 'from' already above + if tmp==link: + continue + sys.stderr.write('Merging branch [%s] with parent [%s] from [r%d]\n' % + (branch,tmp,p)) + wr('merge %s' % tmp) + + last[branch]=revision + 
heads[branch]='' + + ctx=repo.changectx(str(revision)) + man=ctx.manifest() + + wr('deleteall') + + for f in man.keys(): + fctx=ctx.filectx(f) + d=fctx.data() + wr('M %s inline %s' % (gitmode(man.execf(f)),f)) + wr('data %d' % len(d)) # had some trouble with size() + wr(d) + + wr() + return checkpoint(count) + +def export_tags(ui,repo,cache,count): + l=repo.tagslist() + for tag,node in l: + if tag=='tip': + continue + rev=repo.changelog.rev(node) + ref=cache.get(str(rev),None) + if ref==None: + sys.stderr.write('Failed to find reference for creating tag' + ' %s at r%d\n' % (tag,rev)) + continue + (_,user,(time,timezone),_,desc,branch,_)=get_changeset(ui,repo,rev) + sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag,rev,ref)) + wr('tag %s' % tag) + wr('from %s' % ref) + wr('tagger %s %d %s' % (user,time,timezone)) + msg='hg2git created tag %s for hg revision %d on branch %s on (summary):\n\t%s' % (tag, + rev,branch,desc.split('\n')[0]) + wr('data %d' % (len(msg)+1)) + wr(msg) + wr() + count=checkpoint(count) + return count + +def load_cache(filename): + cache={} + if not os.path.exists(filename): + return cache + f=open(filename,'r') + l=0 + for line in f.readlines(): + l+=1 + fields=line.split(' ') + if fields==None or not len(fields)==2 or fields[0][0]!=':': + sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename,l)) + continue + # put key:value in cache, key without ^: + cache[fields[0][1:]]=fields[1].split('\n')[0] + f.close() + return cache + +def save_cache(filename,cache): + f=open(filename,'w+') + map(lambda x: f.write(':%s %s\n' % (str(x),str(cache.get(x)))),cache.keys()) + f.close() + +def verify_heads(ui,repo,cache): + def getsha1(branch): + f=open(os.getenv('GIT_DIR','/dev/null')+'/refs/heads/'+branch) + sha1=f.readlines()[0].split('\n')[0] + f.close() + return sha1 + + for b in cache.keys(): + sys.stderr.write('Verifying branch [%s]\n' % b) + sha1=getsha1(b) + c=cache.get(b) + if sha1!=c: + sys.stderr.write('Warning: 
Branch [%s] modified outside hg2git:' + '\n%s (repo) != %s (cache)\n' % (b,sha1,c)) + return True + +if __name__=='__main__': + if len(sys.argv)!=6: sys.exit(usage(1)) + repourl,m,marksfile,headsfile,tipfile=sys.argv[1:] + _max=int(m) + + marks_cache=load_cache(marksfile) + heads_cache=load_cache(headsfile) + state_cache=load_cache(tipfile) + + ui,repo=setup_repo(repourl) + + if not verify_heads(ui,repo,heads_cache): + sys.exit(1) + + tip=repo.changelog.count() + + min=int(state_cache.get('tip',0)) + max=_max + if _max<0: + max=tip + + c=int(state_cache.get('count',0)) + last={} + for rev in range(min,max): + c=export_commit(ui,repo,rev,marks_cache,heads_cache,last,tip,c) + + c=export_tags(ui,repo,marks_cache,c) + + state_cache['tip']=max + state_cache['count']=c + state_cache['repo']=repourl + save_cache(tipfile,state_cache) diff --git a/hg2git.sh b/hg2git.sh new file mode 100755 index 0000000..c51c1d5 --- /dev/null +++ b/hg2git.sh @@ -0,0 +1,76 @@ +#!/bin/sh + +USAGE='[-m max] repo' +LONG_USAGE='Import hg repository up to either tip or ' +ROOT="`dirname $0`" +REPO="" +MAX="-1" +PFX="hg2git" +SFX_MARKS="marks" +SFX_HEADS="heads" +SFX_STATE="state" + +. git-sh-setup +cd_to_toplevel + +while case "$#" in 0) break ;; esac +do + case "$1" in + -m) + shift + MAX="$1" + ;; + -*) + usage + ;; + *) + break + ;; + esac + shift +done + +if [ "$#" != 1 ] ; then + usage + exit 1 +fi + +REPO="$1" + +# make sure we have a marks cache +if [ ! -f "$GIT_DIR/$PFX-$SFX_MARKS" ] ; then + touch "$GIT_DIR/$PFX-$SFX_MARKS" +fi + +GIT_DIR="$GIT_DIR" python "$ROOT/hg2git.py" \ + "$REPO" \ + "$MAX" \ + "$GIT_DIR/$PFX-$SFX_MARKS" \ + "$GIT_DIR/$PFX-$SFX_HEADS" \ + "$GIT_DIR/$PFX-$SFX_STATE" \ +| git-fast-import --export-marks="$GIT_DIR/$PFX-$SFX_MARKS.tmp" \ +|| die 'Git fast-import failed' + +# move recent marks cache out of the way... 
+if [ -f "$GIT_DIR/$PFX-$SFX_MARKS" ] ; then + mv "$GIT_DIR/$PFX-$SFX_MARKS" "$GIT_DIR/$PFX-$SFX_MARKS.old" +else + touch "$GIT_DIR/$PFX-$SFX_MARKS.old" +fi + +# ...to create a new merged one +cat "$GIT_DIR/$PFX-$SFX_MARKS.old" "$GIT_DIR/$PFX-$SFX_MARKS.tmp" \ +| uniq > "$GIT_DIR/$PFX-$SFX_MARKS" + +# cleanup +rm -rf "$GIT_DIR/$PFX-$SFX_MARKS.old" "$GIT_DIR/$PFX-$SFX_MARKS.tmp" + +# save SHA1s of current heads for incremental imports +# and connectivity (plus sanity checking) +for head in `ls "$GIT_DIR/refs/heads"` ; do + id="`git-rev-parse $head`" + echo ":$head $id" +done > "$GIT_DIR/$PFX-$SFX_HEADS" + +# check diff with color: +# ( for i in `find . -type f | grep -v '\.git'` ; do diff -u $i $REPO/$i ; done | cdiff ) | less -r From 95e06a1f56781a1547c377034b7add7ba5b7ec63 Mon Sep 17 00:00:00 2001 From: Rocco Rutte Date: Tue, 6 Mar 2007 19:46:50 +0000 Subject: [PATCH 02/38] Fix wrapper script for branches with slashes Use git-branch(1) instead of ls(1) for getting branch names. --- hg2git.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hg2git.sh b/hg2git.sh index c51c1d5..ea25f9a 100755 --- a/hg2git.sh +++ b/hg2git.sh @@ -67,7 +67,7 @@ rm -rf "$GIT_DIR/$PFX-$SFX_MARKS.old" "$GIT_DIR/$PFX-$SFX_MARKS.tmp" # save SHA1s of current heads for incremental imports # and connectivity (plus sanity checking) -for head in `ls "$GIT_DIR/refs/heads"` ; do +for head in `git branch | sed 's#^..##'` ; do id="`git-rev-parse $head`" echo ":$head $id" done > "$GIT_DIR/$PFX-$SFX_HEADS" From b702707afc942914f446dab8bb7eaa3d5a47e09c Mon Sep 17 00:00:00 2001 From: Rocco Rutte Date: Tue, 6 Mar 2007 19:47:51 +0000 Subject: [PATCH 03/38] Provide two versions of getting changes per manifest As suggested on #mercurial, filectxs() may be faster than calling filectx() per file.
--- hg2git.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/hg2git.py b/hg2git.py index 0bbbe03..076a80e 100644 --- a/hg2git.py +++ b/hg2git.py @@ -119,13 +119,21 @@ def export_commit(ui,repo,revision,marks,heads,last,max,count): last[branch]=revision heads[branch]='' + # just wipe the branch clean, all full manifest contents + wr('deleteall') + ctx=repo.changectx(str(revision)) man=ctx.manifest() - wr('deleteall') + #for f in man.keys(): + # fctx=ctx.filectx(f) + # d=fctx.data() + # wr('M %s inline %s' % (gitmode(man.execf(f)),f)) + # wr('data %d' % len(d)) # had some trouble with size() + # wr(d) - for f in man.keys(): - fctx=ctx.filectx(f) + for fctx in ctx.filectxs(): + f=fctx.path() d=fctx.data() wr('M %s inline %s' % (gitmode(man.execf(f)),f)) wr('data %d' % len(d)) # had some trouble with size() From 9ebd965f72da2afd028763bdbd48298bcd5f9db7 Mon Sep 17 00:00:00 2001 From: Rocco Rutte Date: Tue, 6 Mar 2007 22:11:10 +0000 Subject: [PATCH 04/38] Link and not merge first branch of incremental import --- hg2git.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hg2git.py b/hg2git.py index 076a80e..b00a82e 100644 --- a/hg2git.py +++ b/hg2git.py @@ -90,6 +90,9 @@ def export_commit(ui,repo,revision,marks,heads,last,max,count): # and kill reference so we won't init it again wr('from %s' % src) heads[branch]='' + sys.stderr.write('Initializing branch [%s] to parent [%s]\n' % + (branch,src)) + link=src # avoid making a merge commit for incremental import elif not heads.has_key(branch) and revision>0: # newly created branch and not the first one: connect to parent tmp=get_parent_mark(parents[0],marks) From 8d433b85c23aff3b43aa2b48520048db9ce50752 Mon Sep 17 00:00:00 2001 From: Rocco Rutte Date: Wed, 7 Mar 2007 01:52:58 +0000 Subject: [PATCH 05/38] Speedup: Use manifest and compare checksums to detect changes Instead of feeding in everything or only something and getting merges wrong, build up a list of changed (incl. 
added) and deleted files by 1) comparing manifest (deleted, added) 2) comparing checksums if file is present in parent and child (change) The hg-crew and mutt imports now go in <15 minutes and md5 sums match. Thanks to Theodore Tso for the hint. While at it, fix a regression that upon incremental import start we always merged a branch plus initializing it. A single test showed that the new detection get starting off from a merge commit right, too. Signed-off-by: Rocco Rutte --- hg2git.py | 82 ++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 54 insertions(+), 28 deletions(-) diff --git a/hg2git.py b/hg2git.py index b00a82e..4d67d9b 100644 --- a/hg2git.py +++ b/hg2git.py @@ -7,7 +7,7 @@ Usage: hg2git.py """ -from mercurial import repo,hg,cmdutil,util,ui,revlog +from mercurial import repo,hg,cmdutil,util,ui,revlog,node from tempfile import mkstemp import re import sys @@ -61,21 +61,48 @@ def checkpoint(count): return count def get_parent_mark(parent,marks): - p=marks.get(str(parent),None) - if p==None: - # if we didn't see parent previously, assume we saw it in this run - p=':%d' % (parent+1) - return p + """Get the mark for some parent. 
+ If we saw it in the current session, return :%d syntax and + otherwise the SHA1 from the cache.""" + return marks.get(str(parent+1),':%d' % (parent+1)) + +def mismatch(x,f1,f2): + """See if two revisions of a file are not equal.""" + return node.hex(f1)!=node.hex(f2) + +def outer_set(dleft,dright,l,r): + """Loop over our repository in and find all changed and missing files.""" + for left in dleft.keys(): + right=dright.get(left,None) + if right==None or mismatch('A',dleft[left],right): + # if either have the current file not in parent or the + # checksums differ: add it to changed files + l.append(left) + for right in dright.keys(): + left=dleft.get(right,None) + if left==None: + # if we have a file in the parent but not our manifest, + # add it to deleted files; checksums are checked earlier + r.append(right) + return l,r + +def get_filechanges(repo,revision,parents,mleft): + """Given some repository and revision, find all changed/deleted files.""" + l,r=[],[] + for p in parents: + if p<0: continue + mright=repo.changectx(p).manifest() + dleft=mleft.keys() + dleft.sort() + dright=mright.keys() + dright.sort() + l,r=outer_set(mleft,mright,l,r) + return l,r def export_commit(ui,repo,revision,marks,heads,last,max,count): - sys.stderr.write('Exporting revision %d (tip %d) as [:%d]\n' % (revision,max,revision+1)) - (_,user,(time,timezone),files,desc,branch,_)=get_changeset(ui,repo,revision) parents=repo.changelog.parentrevs(revision) - # we need this later to write out tags - marks[str(revision)]=':%d'%(revision+1) - wr('commit refs/heads/%s' % branch) wr('mark :%d' % (revision+1)) wr('committer %s %d %s' % (user,time,timezone)) @@ -93,7 +120,7 @@ def export_commit(ui,repo,revision,marks,heads,last,max,count): sys.stderr.write('Initializing branch [%s] to parent [%s]\n' % (branch,src)) link=src # avoid making a merge commit for incremental import - elif not heads.has_key(branch) and revision>0: + elif link=='' and not heads.has_key(branch) and revision>0: # newly 
created branch and not the first one: connect to parent tmp=get_parent_mark(parents[0],marks) wr('from %s' % tmp) @@ -111,8 +138,8 @@ def export_commit(ui,repo,revision,marks,heads,last,max,count): if p==l or p==revision or p<0: continue tmp=get_parent_mark(p,marks) - # if we fork off a branch, don't merge via 'merge' as we have - # 'from' already above + # if we fork off a branch, don't merge with our parent via 'merge' + # as we have 'from' already above if tmp==link: continue sys.stderr.write('Merging branch [%s] with parent [%s] from [r%d]\n' % @@ -121,27 +148,26 @@ def export_commit(ui,repo,revision,marks,heads,last,max,count): last[branch]=revision heads[branch]='' - - # just wipe the branch clean, all full manifest contents - wr('deleteall') + # we need this later to write out tags + marks[str(revision)]=':%d'%(revision+1) ctx=repo.changectx(str(revision)) man=ctx.manifest() + added,removed=get_filechanges(repo,revision,parents,man) - #for f in man.keys(): - # fctx=ctx.filectx(f) - # d=fctx.data() - # wr('M %s inline %s' % (gitmode(man.execf(f)),f)) - # wr('data %d' % len(d)) # had some trouble with size() - # wr(d) + sys.stderr.write('Exporting revision %d with %d changed/%d removed files\n' % + (revision,len(added),len(removed))) - for fctx in ctx.filectxs(): - f=fctx.path() + for a in added: + fctx=ctx.filectx(a) d=fctx.data() - wr('M %s inline %s' % (gitmode(man.execf(f)),f)) + wr('M %s inline %s' % (gitmode(man.execf(a)),a)) wr('data %d' % len(d)) # had some trouble with size() wr(d) + for r in removed: + wr('D %s' % r) + wr() return checkpoint(count) @@ -153,8 +179,8 @@ def export_tags(ui,repo,cache,count): rev=repo.changelog.rev(node) ref=cache.get(str(rev),None) if ref==None: - sys.stderr.write('Failed to find reference for creating tag' - ' %s at r%d\n' % (tag,rev)) + #sys.stderr.write('Failed to find reference for creating tag' + # ' %s at r%d\n' % (tag,rev)) continue (_,user,(time,timezone),_,desc,branch,_)=get_changeset(ui,repo,rev) 
sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag,rev,ref)) From 2b319f9de4bec43cd49f942bfb992e8a398a2a49 Mon Sep 17 00:00:00 2001 From: Rocco Rutte Date: Wed, 7 Mar 2007 11:06:34 +0000 Subject: [PATCH 06/38] Don't restore checkpoint count from cache It doesn't make sense as each run of git-fast-import starts a new pack anyways. Signed-off-by: Rocco Rutte --- hg2git.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/hg2git.py b/hg2git.py index 4d67d9b..8f2ee22 100644 --- a/hg2git.py +++ b/hg2git.py @@ -254,7 +254,7 @@ if __name__=='__main__': if _max<0: max=tip - c=int(state_cache.get('count',0)) + c=0 last={} for rev in range(min,max): c=export_commit(ui,repo,rev,marks_cache,heads_cache,last,tip,c) @@ -262,6 +262,5 @@ if __name__=='__main__': c=export_tags(ui,repo,marks_cache,c) state_cache['tip']=max - state_cache['count']=c state_cache['repo']=repourl save_cache(tipfile,state_cache) From cdb66d3298d8d6381c39199ffd10b3ae3dc8ffaf Mon Sep 17 00:00:00 2001 From: Rocco Rutte Date: Wed, 7 Mar 2007 11:24:59 +0000 Subject: [PATCH 07/38] Add --quiet option passed to git-fast-import(1) Signed-off-by: Rocco Rutte --- hg2git.sh | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/hg2git.sh b/hg2git.sh index ea25f9a..46ce727 100755 --- a/hg2git.sh +++ b/hg2git.sh @@ -1,6 +1,6 @@ #!/bin/sh -USAGE='[-m max] repo' +USAGE='[-m max] [--quiet] repo' LONG_USAGE='Import hg repository up to either tip or ' ROOT="`dirname $0`" REPO="" @@ -9,6 +9,7 @@ PFX="hg2git" SFX_MARKS="marks" SFX_HEADS="heads" SFX_STATE="state" +QUIET="" . 
git-sh-setup cd_to_toplevel @@ -20,6 +21,9 @@ do shift MAX="$1" ;; + --q|--qu|--qui|--quie|--quiet) + QUIET="--quiet" + ;; -*) usage ;; @@ -48,7 +52,7 @@ GIT_DIR="$GIT_DIR" python "$ROOT/hg2git.py" \ "$GIT_DIR/$PFX-$SFX_MARKS" \ "$GIT_DIR/$PFX-$SFX_HEADS" \ "$GIT_DIR/$PFX-$SFX_STATE" \ -| git-fast-import --export-marks="$GIT_DIR/$PFX-$SFX_MARKS.tmp" \ +| git-fast-import $QUIET --export-marks="$GIT_DIR/$PFX-$SFX_MARKS.tmp" \ || die 'Git fast-import failed' # move recent marks cache out of the way... From 799a4d86829a538a898e801a5080e089e8aab815 Mon Sep 17 00:00:00 2001 From: Rocco Rutte Date: Wed, 7 Mar 2007 11:33:03 +0000 Subject: [PATCH 08/38] Only attempt to export tags in our input range This prints much less noise and potentially makes git-fast-import(1) less confused. Signed-off-by: Rocco Rutte --- hg2git.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/hg2git.py b/hg2git.py index 8f2ee22..341e663 100644 --- a/hg2git.py +++ b/hg2git.py @@ -171,16 +171,19 @@ def export_commit(ui,repo,revision,marks,heads,last,max,count): wr() return checkpoint(count) -def export_tags(ui,repo,cache,count): +def export_tags(ui,repo,marks_cache,start,end,count): l=repo.tagslist() for tag,node in l: - if tag=='tip': - continue + # ignore latest revision + if tag=='tip': continue rev=repo.changelog.rev(node) - ref=cache.get(str(rev),None) + # ignore those tags not in our import range + if rev<start or rev>=end: continue + + ref=marks_cache.get(str(rev),None) if ref==None: - #sys.stderr.write('Failed to find reference for creating tag' - # ' %s at r%d\n' % (tag,rev)) + sys.stderr.write('Failed to find reference for creating tag' + ' %s at r%d\n' % (tag,rev)) continue (_,user,(time,timezone),_,desc,branch,_)=get_changeset(ui,repo,rev) sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag,rev,ref)) @@ -259,7 +262,9 @@ if __name__=='__main__': for rev in range(min,max): c=export_commit(ui,repo,rev,marks_cache,heads_cache,last,tip,c) -
c=export_tags(ui,repo,marks_cache,c) + c=export_tags(ui,repo,marks_cache,min,max,c) + + sys.stderr.write('Issued %d commands\n' % c) state_cache['tip']=max state_cache['repo']=repourl From c002051c82e22ca816afd1475885fc38e92693ad Mon Sep 17 00:00:00 2001 From: Rocco Rutte Date: Wed, 7 Mar 2007 11:38:56 +0000 Subject: [PATCH 09/38] Distinct between added, changed and removed files for UI output To git-fast-import(1) we feed in changed and added files completely, so thers's no real difference except UI output (potentially for debugging). Signed-off-by: Rocco Rutte --- hg2git.py | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/hg2git.py b/hg2git.py index 341e663..a081b87 100644 --- a/hg2git.py +++ b/hg2git.py @@ -66,29 +66,31 @@ def get_parent_mark(parent,marks): otherwise the SHA1 from the cache.""" return marks.get(str(parent+1),':%d' % (parent+1)) -def mismatch(x,f1,f2): +def mismatch(f1,f2): """See if two revisions of a file are not equal.""" return node.hex(f1)!=node.hex(f2) -def outer_set(dleft,dright,l,r): - """Loop over our repository in and find all changed and missing files.""" +def outer_set(dleft,dright,l,c,r): + """Loop over our repository and find all changed and missing files.""" for left in dleft.keys(): right=dright.get(left,None) - if right==None or mismatch('A',dleft[left],right): - # if either have the current file not in parent or the - # checksums differ: add it to changed files + if right==None: + # we have the file but our parent hasn't: add to left set l.append(left) + elif mismatch(dleft[left],right): + # we have it but checksums mismatch: add to center set + c.append(left) for right in dright.keys(): left=dleft.get(right,None) if left==None: - # if we have a file in the parent but not our manifest, - # add it to deleted files; checksums are checked earlier + # if parent has file but we don't: add to right set r.append(right) - return l,r + # change is already handled when comparing child 
against parent + return l,c,r def get_filechanges(repo,revision,parents,mleft): """Given some repository and revision, find all changed/deleted files.""" - l,r=[],[] + l,c,r=[],[],[] for p in parents: if p<0: continue mright=repo.changectx(p).manifest() @@ -96,8 +98,8 @@ def get_filechanges(repo,revision,parents,mleft): dleft.sort() dright=mright.keys() dright.sort() - l,r=outer_set(mleft,mright,l,r) - return l,r + l,c,r=outer_set(mleft,mright,l,c,r) + return l,c,r def export_commit(ui,repo,revision,marks,heads,last,max,count): (_,user,(time,timezone),files,desc,branch,_)=get_changeset(ui,repo,revision) @@ -153,12 +155,12 @@ def export_commit(ui,repo,revision,marks,heads,last,max,count): ctx=repo.changectx(str(revision)) man=ctx.manifest() - added,removed=get_filechanges(repo,revision,parents,man) + added,changed,removed=get_filechanges(repo,revision,parents,man) - sys.stderr.write('Exporting revision %d with %d changed/%d removed files\n' % - (revision,len(added),len(removed))) + sys.stderr.write('Exporting revision %d with %d/%d/%d added/changed/removed files\n' % + (revision,len(added),len(changed),len(removed))) - for a in added: + for a in added+changed: fctx=ctx.filectx(a) d=fctx.data() wr('M %s inline %s' % (gitmode(man.execf(a)),a)) From 2eba1c38ec9a8d72540bae94160ca3ba5546e189 Mon Sep 17 00:00:00 2001 From: Rocco Rutte Date: Wed, 7 Mar 2007 15:10:39 +0000 Subject: [PATCH 10/38] Add some simple readme with legal stuff, usage and some notes Signed-off-by: Rocco Rutte --- README.txt | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 README.txt diff --git a/README.txt b/README.txt new file mode 100644 index 0000000..e25075b --- /dev/null +++ b/README.txt @@ -0,0 +1,33 @@ +hg2git.(sh|py) - mercurial to git converter using git-fast-import + +Legal +===== + +The scripts are licensed under the GPL version 2 and were written by +Rocco Rutte with hints and help from the git list and +#mercurial on freenode.
+ +Usage +===== + +Using it is quite simple for a mercurial repository : + + mkdir repo-git # or whatever + cd repo-git + git init + hg2git.sh + +Incremental imports to track hg repos is supported, too. + +Notes +===== + +As there's no real config interface to hg2git.py (the worker script), +checkpointing each 1000 changesets is hard-coded. "checkpointing" means +to issue the "checkpoint" command of git-fast-import which then flushes +the current pack file and starts a new one. This is sufficient for the +initial import. + +However, per incremental import with fewer than 1000 changesets (read: +most likely always), a new pack file will be created. Every time. As a +consequence, the git repo should be repacked quite often. From dbac8741df3d5cbe87404c9dedc07b77d2dc3023 Mon Sep 17 00:00:00 2001 From: Rocco Rutte Date: Thu, 8 Mar 2007 09:37:23 +0000 Subject: [PATCH 11/38] Don't use checkpoint As git-fast-import already generates at least one pack per run, don't even further split these up on a (default) 1k changeset boundary. Also rework the documentation on that one a little. Signed-off-by: Rocco Rutte --- README.txt | 12 +++--------- hg2git.py | 6 +++--- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/README.txt b/README.txt index e25075b..368e7ea 100644 --- a/README.txt +++ b/README.txt @@ -22,12 +22,6 @@ Incremental imports to track hg repos is supported, too. Notes ===== -As there's no real config interface to hg2git.py (the worker script), -checkpointing each 1000 changesets is hard-coded. "checkpointing" means -to issue the "checkpoint" command of git-fast-import which then flushes -the current pack file and starts a new one. This is sufficient for the -initial import. - -However, per incremental import with fewer than 1000 changesets (read: -most likely always), a new pack file will be created. Every time. As a -consequence, the git repo should be repacked quite often. 
+As each git-fast-import run creates a new pack file, it may be required +to repack the repository quite often for incremental imports (especially +when importing a small number of changesets per incremental import). diff --git a/hg2git.py b/hg2git.py index a081b87..ee95dfb 100644 --- a/hg2git.py +++ b/hg2git.py @@ -17,8 +17,8 @@ import os user_re=re.compile('[^<]+ <[^>]+>$') # git branch for hg's default 'HEAD' branch cfg_master='master' -# insert 'checkpoint' command after this many commits -cfg_checkpoint_count=1000 +# insert 'checkpoint' command after this many commits or none at all if 0 +cfg_checkpoint_count=0 def usage(ret): sys.stderr.write(__doc__) @@ -54,7 +54,7 @@ def wr(msg=''): def checkpoint(count): count=count+1 - if count%cfg_checkpoint_count==0: + if cfg_checkpoint_count>0 and count%cfg_checkpoint_count==0: sys.stderr.write("Checkpoint after %d commits\n" % count) wr('checkpoint') wr() From 69e2d5df9b17b04802a00f898b8019d7f7dd59a0 Mon Sep 17 00:00:00 2001 From: Rocco Rutte Date: Thu, 8 Mar 2007 09:37:41 +0000 Subject: [PATCH 12/38] Add some design notes to readme Signed-off-by: Rocco Rutte --- README.txt | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/README.txt b/README.txt index 368e7ea..d28a096 100644 --- a/README.txt +++ b/README.txt @@ -25,3 +25,12 @@ Notes As each git-fast-import run creates a new pack file, it may be required to repack the repository quite often for incremental imports (especially when importing a small number of changesets per incremental import). + +Design +====== + +hg2git.py was designed in a way that doesn't require a 2-pass mechanism +or any prior repository analysis: if just feeds what it finds into +git-fast-import. This also implies that it heavily relies on strictly +linear ordering of changesets from hg, i.e. its append-only storage +model so that changesets hg2git already saw never get modified.
From e2edb79bc2c2168aa7e656502a480d1a24190a24 Mon Sep 17 00:00:00 2001 From: Rocco Rutte Date: Thu, 8 Mar 2007 10:12:01 +0000 Subject: [PATCH 13/38] Add note about SHA1 stability to readme Signed-off-by: Rocco Rutte --- README.txt | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/README.txt b/README.txt index d28a096..4959461 100644 --- a/README.txt +++ b/README.txt @@ -34,3 +34,12 @@ or any prior repository analysis: if just feeds what it finds into git-fast-import. This also implies that it heavily relies on strictly linear ordering of changesets from hg, i.e. its append-only storage model so that changesets hg2git already saw never get modified. + +Import and SHA stability +======================== + +Currently it's only supported to map one hg repository to one git +repository. However, all forks of a hg repo can be imported into one git +repo each and then merged together (e.g. as different branches in the +final git repo) since the checksums are stable, i.e. one particular hg +changeset always produces the same git SHA1 checksum. From 61bb1cb70794df21b626159ff164839d7915907e Mon Sep 17 00:00:00 2001 From: Rocco Rutte Date: Thu, 8 Mar 2007 11:16:28 +0000 Subject: [PATCH 14/38] hg2git.sh: Try to get last hg repo url from state file As one usually mirrors a git repo based on the same hg repo, it's annoying having to specify the hg repo url all the time. Since we intentionally stored the url in the state file already, use it in the shell wrapper and default to it.
Signed-off-by: Rocco Rutte --- hg2git.sh | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/hg2git.sh b/hg2git.sh index 46ce727..5de3270 100755 --- a/hg2git.sh +++ b/hg2git.sh @@ -1,7 +1,5 @@ #!/bin/sh -USAGE='[-m max] [--quiet] repo' -LONG_USAGE='Import hg repository up to either tip or ' ROOT="`dirname $0`" REPO="" MAX="-1" @@ -11,6 +9,11 @@ SFX_HEADS="heads" SFX_STATE="state" QUIET="" +USAGE="[-m ] [--quiet] []" +LONG_USAGE="Import hg repository up to either tip or +If is omitted, use last hg repository as obtained from state file, +GIT_DIR/$PFX-$SFX_STATE by default." + . git-sh-setup cd_to_toplevel @@ -34,12 +37,20 @@ do shift done -if [ "$#" != 1 ] ; then - usage - exit 1 +# for convenience: get default repo from state file +if [ "$#" != 1 -a -f "$GIT_DIR/$PFX-$SFX_STATE" ] ; then + REPO="`egrep '^:repo ' "$GIT_DIR/$PFX-$SFX_STATE" | cut -d ' ' -f 2`" + echo "Using last hg repository \"$REPO\"" fi -REPO="$1" +if [ x"$REPO" = x ] ; then + if [ "$#" != 1 ] ; then + usage + exit 1 + else + REPO="$1" + fi +fi # make sure we have a marks cache if [ ! -f "$GIT_DIR/$PFX-$SFX_MARKS" ] ; then From 85f0d9c88103d6ffa27549dd46e3341438e6257a Mon Sep 17 00:00:00 2001 From: Rocco Rutte Date: Thu, 8 Mar 2007 11:21:21 +0000 Subject: [PATCH 15/38] hg2git.py: Refactor main code into hg2git() function Now this can even be used as a module from other python scripts by simply calling the hg2git() function. Except some config values nobody really ever wants to change, it's even safe to run several hg2git() functions in parallel as no global vars or the like are used by intention (but it makes the code uglier).
Signed-off-by: Rocco Rutte --- hg2git.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/hg2git.py b/hg2git.py index ee95dfb..1683cb5 100644 --- a/hg2git.py +++ b/hg2git.py @@ -238,9 +238,7 @@ def verify_heads(ui,repo,cache): '\n%s (repo) != %s (cache)\n' % (b,sha1,c)) return True -if __name__=='__main__': - if len(sys.argv)!=6: sys.exit(usage(1)) - repourl,m,marksfile,headsfile,tipfile=sys.argv[1:] +def hg2git(repourl,m,marksfile,headsfile,tipfile): _max=int(m) marks_cache=load_cache(marksfile) @@ -250,7 +248,7 @@ if __name__=='__main__': ui,repo=setup_repo(repourl) if not verify_heads(ui,repo,heads_cache): - sys.exit(1) + return 1 tip=repo.changelog.count() @@ -271,3 +269,10 @@ if __name__=='__main__': state_cache['tip']=max state_cache['repo']=repourl save_cache(tipfile,state_cache) + + return 0 + +if __name__=='__main__': + if len(sys.argv)!=6: sys.exit(usage(1)) + repourl,m,marksfile,headsfile,tipfile=sys.argv[1:] + sys.exit(hg2git(repourl,m,marksfile,headsfile,tipfile)) From ec2aceeacbd3c35946febc2a3449e24ab38bb5d3 Mon Sep 17 00:00:00 2001 From: Rocco Rutte Date: Thu, 8 Mar 2007 11:46:41 +0000 Subject: [PATCH 16/38] Add Todo section to readme Signed-off-by: Rocco Rutte --- README.txt | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/README.txt b/README.txt index 4959461..4f57f72 100644 --- a/README.txt +++ b/README.txt @@ -43,3 +43,20 @@ repository. However, all forks of a hg repo can be imported into one git repo each and then merged together (e.g. as different branches in the final git repo) since the checksums are stable, i.e. one particular hg changeset always produces the same git SHA1 checksum. + +Todo +==== + +For incremental imports, handling tags needs to be reworked (maybe): +Right now we assume that once a tag is created, it stays forever and +never changes. 
However, + + 1) tags in hg may be removed + 2) tags may change + +I'm not yet sure how to handle this and how this interferes with +non-hg-based tags in git. + +The same for branches: They may get removed. + +For one-time conversions, everything is fine. From 31985600163b6f242bfe69aeba21da0fc00b44a7 Mon Sep 17 00:00:00 2001 From: Rocco Rutte Date: Fri, 9 Mar 2007 11:09:57 +0000 Subject: [PATCH 17/38] hg2git.py: Add support for user-defined hg author -> git author mapping The mapping is a python dictionary given to the hg2git() function. This isn't extremely useful as there's no option passing from hg2git.sh to hg2git.py (yet). Signed-off-by: Rocco Rutte --- hg2git.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/hg2git.py b/hg2git.py index 1683cb5..046ad1f 100644 --- a/hg2git.py +++ b/hg2git.py @@ -28,12 +28,16 @@ def setup_repo(url): myui=ui.ui() return myui,hg.repository(myui,url) -def get_changeset(ui,repo,revision): +def get_changeset(ui,repo,revision,authors): def get_branch(name): if name=='HEAD': name=cfg_master return name - def fixup_user(user): + def fixup_user(user,authors): + if authors!=None: + # if we have an authors table, try to get mapping + # by defaultung to the current value of 'user' + user=authors.get(user,user) if user_re.match(user)==None: if '@' not in user: return user+' ' @@ -43,7 +47,7 @@ def get_changeset(ui,repo,revision): (manifest,user,(time,timezone),files,desc,extra)=repo.changelog.read(node) tz="%+03d%02d" % (-timezone / 3600, ((-timezone % 3600) / 60)) branch=get_branch(extra.get('branch','master')) - return (manifest,fixup_user(user),(time,tz),files,desc,branch,extra) + return (manifest,fixup_user(user,authors),(time,tz),files,desc,branch,extra) def gitmode(x): return x and '100755' or '100644' @@ -101,8 +105,8 @@ def get_filechanges(repo,revision,parents,mleft): l,c,r=outer_set(mleft,mright,l,c,r) return l,c,r -def export_commit(ui,repo,revision,marks,heads,last,max,count): - 
(_,user,(time,timezone),files,desc,branch,_)=get_changeset(ui,repo,revision) +def export_commit(ui,repo,revision,marks,heads,last,max,count,authors): + (_,user,(time,timezone),files,desc,branch,_)=get_changeset(ui,repo,revision,authors) parents=repo.changelog.parentrevs(revision) wr('commit refs/heads/%s' % branch) @@ -173,7 +177,7 @@ def export_commit(ui,repo,revision,marks,heads,last,max,count): wr() return checkpoint(count) -def export_tags(ui,repo,marks_cache,start,end,count): +def export_tags(ui,repo,marks_cache,start,end,count,authors): l=repo.tagslist() for tag,node in l: # ignore latest revision @@ -187,7 +191,7 @@ def export_tags(ui,repo,marks_cache,start,end,count): sys.stderr.write('Failed to find reference for creating tag' ' %s at r%d\n' % (tag,rev)) continue - (_,user,(time,timezone),_,desc,branch,_)=get_changeset(ui,repo,rev) + (_,user,(time,timezone),_,desc,branch,_)=get_changeset(ui,repo,rev,authors) sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag,rev,ref)) wr('tag %s' % tag) wr('from %s' % ref) @@ -238,7 +242,7 @@ def verify_heads(ui,repo,cache): '\n%s (repo) != %s (cache)\n' % (b,sha1,c)) return True -def hg2git(repourl,m,marksfile,headsfile,tipfile): +def hg2git(repourl,m,marksfile,headsfile,tipfile,authors={}): _max=int(m) marks_cache=load_cache(marksfile) @@ -260,9 +264,9 @@ def hg2git(repourl,m,marksfile,headsfile,tipfile): c=0 last={} for rev in range(min,max): - c=export_commit(ui,repo,rev,marks_cache,heads_cache,last,tip,c) + c=export_commit(ui,repo,rev,marks_cache,heads_cache,last,tip,c,authors) - c=export_tags(ui,repo,marks_cache,min,max,c) + c=export_tags(ui,repo,marks_cache,min,max,c,authors) sys.stderr.write('Issued %d commands\n' % c) From 796fa5f4cb3ed93948b2efef6273fa12c64c325c Mon Sep 17 00:00:00 2001 From: Rocco Rutte Date: Fri, 9 Mar 2007 12:07:08 +0000 Subject: [PATCH 18/38] hg2git.py: Add support for extracting authorship from Signed-off-by lines Unfortunately, it's not configurable yet (read: cannot be 
disabled) as it may take some time to match against regex all the time (especially from some initial import). This also enables cleaning up usernames by stripping silly leading and trailing chars like '"' (which is the only one supported ATM). Signed-off-by: Rocco Rutte --- hg2git.py | 79 +++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 68 insertions(+), 11 deletions(-) diff --git a/hg2git.py b/hg2git.py index 046ad1f..e328a42 100644 --- a/hg2git.py +++ b/hg2git.py @@ -13,8 +13,12 @@ import re import sys import os +# silly regex to catch Signed-off-by lines in log message +sob_re=re.compile('^Signed-[Oo]ff-[Bb]y: (.+)$') # silly regex to see if user field has email address -user_re=re.compile('[^<]+ <[^>]+>$') +user_re=re.compile('([^<]+) (<[^>]+>)$') +# silly regex to clean out user names +user_clean_re=re.compile('^["]([^"]+)["]$') # git branch for hg's default 'HEAD' branch cfg_master='master' # insert 'checkpoint' command after this many commits or none at all if 0 @@ -28,21 +32,36 @@ def setup_repo(url): myui=ui.ui() return myui,hg.repository(myui,url) +def fixup_user(user,authors): + if authors!=None: + # if we have an authors table, try to get mapping + # by defaulting to the current value of 'user' + user=authors.get(user,user) + name,mail,m='','',user_re.match(user) + if m==None: + # if we don't have 'Name ' syntax, use 'user + # ' if use contains no at and + # 'user ' otherwise + name=user + if '@' not in user: + mail='' + else: + mail='<%s>' % user + else: + # if we have 'Name ' syntax, everything is fine :) + name,mail=m.group(1),m.group(2) + + # remove any silly quoting from username + m2=user_clean_re.match(name) + if m2!=None: + name=m2.group(1) + return '%s %s' % (name,mail) + def get_changeset(ui,repo,revision,authors): def get_branch(name): if name=='HEAD': name=cfg_master return name - def fixup_user(user,authors): - if authors!=None: - # if we have an authors table, try to get mapping - # by defaultung to the current value 
of 'user' - user=authors.get(user,user) - if user_re.match(user)==None: - if '@' not in user: - return user+' ' - return user+' <'+user+'>' - return user node=repo.lookup(revision) (manifest,user,(time,timezone),files,desc,extra)=repo.changelog.read(node) tz="%+03d%02d" % (-timezone / 3600, ((-timezone % 3600) / 60)) @@ -105,12 +124,50 @@ def get_filechanges(repo,revision,parents,mleft): l,c,r=outer_set(mleft,mright,l,c,r) return l,c,r +def get_author(logmessage,committer,authors): + """As git distincts between author and committer of a patch, try to + extract author by detecting Signed-off-by lines. + + This walks from the end of the log message towards the top skipping + empty lines. Upon the first non-empty line, it walks all Signed-off-by + lines upwards to find the first one. For that (if found), it extracts + authorship information the usual way (authors table, cleaning, etc.) + + If no Signed-off-by line is found, this defaults to the committer. + + This may sound stupid (and it somehow is), but in log messages we + accidentially may have lines in the middle starting with + "Signed-off-by: foo" and thus matching our detection regex. 
Prevent + that.""" + + loglines=logmessage.split('\n') + i=len(loglines) + # from tail walk to top skipping empty lines + while i>=0: + i-=1 + if len(loglines[i].strip())==0: continue + break + if i>=0: + # walk further upwards to find first sob line, store in 'first' + first=None + while i>=0: + m=sob_re.match(loglines[i]) + if m==None: break + first=m + i-=1 + # if the last non-empty line matches our Signed-Off-by regex: extract username + if first!=None: + r=fixup_user(first.group(1),authors) + return r + return committer + def export_commit(ui,repo,revision,marks,heads,last,max,count,authors): (_,user,(time,timezone),files,desc,branch,_)=get_changeset(ui,repo,revision,authors) parents=repo.changelog.parentrevs(revision) wr('commit refs/heads/%s' % branch) wr('mark :%d' % (revision+1)) + wr('author %s %d %s' % (get_author(desc,user,authors),time,timezone)) wr('committer %s %d %s' % (user,time,timezone)) wr('data %d' % (len(desc)+1)) # wtf? wr(desc) From af3810ae2fd882cf841a0c6178c9e6e0eaa52164 Mon Sep 17 00:00:00 2001 From: Rocco Rutte Date: Sat, 10 Mar 2007 14:28:45 +0000 Subject: [PATCH 19/38] Only attempt to verify heads hg has, too In the git repo there may be any number branches that are not hg imported branches, so it doesn't make sense to print warnings when a non-hg head isn't at what it was last time. Now we get a list of branchtags hg has and only verify these. 
Signed-off-by: Rocco Rutte --- hg2git.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/hg2git.py b/hg2git.py index e328a42..129272a 100644 --- a/hg2git.py +++ b/hg2git.py @@ -57,11 +57,12 @@ def fixup_user(user,authors): name=m2.group(1) return '%s %s' % (name,mail) +def get_branch(name): + if name=='HEAD': + name=cfg_master + return name + def get_changeset(ui,repo,revision,authors): - def get_branch(name): - if name=='HEAD': - name=cfg_master - return name node=repo.lookup(revision) (manifest,user,(time,timezone),files,desc,extra)=repo.changelog.read(node) tz="%+03d%02d" % (-timezone / 3600, ((-timezone % 3600) / 60)) @@ -290,7 +291,13 @@ def verify_heads(ui,repo,cache): f.close() return sha1 - for b in cache.keys(): + # get list of hg's branches to verify, don't take all git has + branches=repo.branchtags() + l=[(-repo.changelog.rev(n), n, t) for t, n in branches.items()] + l.sort() + + for _,_,b in l: + b=get_branch(b) sys.stderr.write('Verifying branch [%s]\n' % b) sha1=getsha1(b) c=cache.get(b) From 2030a3a736d41c2ee9bff3e4d89e22706389938d Mon Sep 17 00:00:00 2001 From: Rocco Rutte Date: Sat, 10 Mar 2007 14:29:49 +0000 Subject: [PATCH 20/38] Rename README.txt to hg2git.txt This is to avoid naming clashes since I'm more or less about to merge hg2git into fast-export.git at repo.or.cz. 
Signed-off-by: Rocco Rutte --- README.txt => hg2git.txt | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename README.txt => hg2git.txt (100%) diff --git a/README.txt b/hg2git.txt similarity index 100% rename from README.txt rename to hg2git.txt From 045eea436caca208a5ae4d08f3d31a550808c993 Mon Sep 17 00:00:00 2001 From: Rocco Rutte Date: Mon, 12 Mar 2007 07:33:40 +0000 Subject: [PATCH 21/38] Basic support for command line options in hg2git.py Signed-off-by: Rocco Rutte --- hg2git.py | 36 ++++++++++++++++++++++++++++++++---- hg2git.sh | 29 ++++++++++------------------- 2 files changed, 42 insertions(+), 23 deletions(-) diff --git a/hg2git.py b/hg2git.py index 129272a..b816eed 100644 --- a/hg2git.py +++ b/hg2git.py @@ -9,6 +9,7 @@ Usage: hg2git.py from mercurial import repo,hg,cmdutil,util,ui,revlog,node from tempfile import mkstemp +from optparse import OptionParser import re import sys import os @@ -322,7 +323,7 @@ def hg2git(repourl,m,marksfile,headsfile,tipfile,authors={}): min=int(state_cache.get('tip',0)) max=_max - if _max<0: + if _max<=0: max=tip c=0 @@ -341,6 +342,33 @@ def hg2git(repourl,m,marksfile,headsfile,tipfile,authors={}): return 0 if __name__=='__main__': - if len(sys.argv)!=6: sys.exit(usage(1)) - repourl,m,marksfile,headsfile,tipfile=sys.argv[1:] - sys.exit(hg2git(repourl,m,marksfile,headsfile,tipfile)) + def bail(parser,opt): + sys.stderr.write('Error: No %s option given\n' % opt) + parser.print_help() + sys.exit(2) + + parser=OptionParser() + + parser.add_option("-m","--max",type="int",dest="max", + help="Maximum hg revision to import") + parser.add_option("--marks",dest="marksfile", + help="File to read git-fast-import's marks from") + parser.add_option("--heads",dest="headsfile", + help="File to read last run's git heads from") + parser.add_option("--status",dest="statusfile", + help="File to read status from") + parser.add_option("-r","--repo",dest="repourl", + help="URL of repo to import") + + (options,args)=parser.parse_args() + + m=0 
+ if options.max!=None: m=options.max + + if options.marksfile==None: bail(parser,'--marks') + if options.marksfile==None: bail(parser,'--heads') + if options.marksfile==None: bail(parser,'--status') + if options.marksfile==None: bail(parser,'--repo') + + sys.exit(hg2git(options.repourl,m,options.marksfile,options.headsfile, + options.headsfile)) diff --git a/hg2git.sh b/hg2git.sh index 5de3270..34d5227 100755 --- a/hg2git.sh +++ b/hg2git.sh @@ -2,7 +2,6 @@ ROOT="`dirname $0`" REPO="" -MAX="-1" PFX="hg2git" SFX_MARKS="marks" SFX_HEADS="heads" @@ -20,15 +19,16 @@ cd_to_toplevel while case "$#" in 0) break ;; esac do case "$1" in - -m) + -r|--r|--re|--rep|--repo) shift - MAX="$1" + REPO="$1" ;; --q|--qu|--qui|--quie|--quiet) QUIET="--quiet" ;; -*) - usage + # pass any other options down to hg2git.py + break ;; *) break @@ -38,31 +38,22 @@ do done # for convenience: get default repo from state file -if [ "$#" != 1 -a -f "$GIT_DIR/$PFX-$SFX_STATE" ] ; then +if [ x"$REPO" = x -a -f "$GIT_DIR/$PFX-$SFX_STATE" ] ; then REPO="`egrep '^:repo ' "$GIT_DIR/$PFX-$SFX_STATE" | cut -d ' ' -f 2`" echo "Using last hg repository \"$REPO\"" fi -if [ x"$REPO" = x ] ; then - if [ "$#" != 1 ] ; then - usage - exit 1 - else - REPO="$1" - fi -fi - # make sure we have a marks cache if [ ! 
-f "$GIT_DIR/$PFX-$SFX_MARKS" ] ; then touch "$GIT_DIR/$PFX-$SFX_MARKS" fi GIT_DIR="$GIT_DIR" python "$ROOT/hg2git.py" \ - "$REPO" \ - "$MAX" \ - "$GIT_DIR/$PFX-$SFX_MARKS" \ - "$GIT_DIR/$PFX-$SFX_HEADS" \ - "$GIT_DIR/$PFX-$SFX_STATE" \ + --repo "$REPO" \ + --marks "$GIT_DIR/$PFX-$SFX_MARKS" \ + --heads "$GIT_DIR/$PFX-$SFX_HEADS" \ + --status "$GIT_DIR/$PFX-$SFX_STATE" \ + "$@" \ | git-fast-import $QUIET --export-marks="$GIT_DIR/$PFX-$SFX_MARKS.tmp" \ || die 'Git fast-import failed' From 469d4f33054fad0e651484fb61f0920a5b60a430 Mon Sep 17 00:00:00 2001 From: Rocco Rutte Date: Mon, 12 Mar 2007 08:00:18 +0000 Subject: [PATCH 22/38] hg2git.py: Disable parsing Signef-off-by lines and add -s to enable IMHO it's highly unusual to have these lines in hg projects but who knows. As it's slow to parse these types of lines (with regex), it's disabled by default and the 'author' command of git-fast-import isn't used at all. It can be enabled by giving the -s switch to hg2git.sh. Signed-off-by: Rocco Rutte --- hg2git.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/hg2git.py b/hg2git.py index b816eed..9d832c3 100644 --- a/hg2git.py +++ b/hg2git.py @@ -163,13 +163,14 @@ def get_author(logmessage,committer,authors): return r return committer -def export_commit(ui,repo,revision,marks,heads,last,max,count,authors): +def export_commit(ui,repo,revision,marks,heads,last,max,count,authors,sob): (_,user,(time,timezone),files,desc,branch,_)=get_changeset(ui,repo,revision,authors) parents=repo.changelog.parentrevs(revision) wr('commit refs/heads/%s' % branch) wr('mark :%d' % (revision+1)) - wr('author %s %d %s' % (get_author(desc,user,authors),time,timezone)) + if sob: + wr('author %s %d %s' % (get_author(desc,user,authors),time,timezone)) wr('committer %s %d %s' % (user,time,timezone)) wr('data %d' % (len(desc)+1)) # wtf? 
wr(desc) @@ -220,8 +221,8 @@ def export_commit(ui,repo,revision,marks,heads,last,max,count,authors): man=ctx.manifest() added,changed,removed=get_filechanges(repo,revision,parents,man) - sys.stderr.write('Exporting revision %d with %d/%d/%d added/changed/removed files\n' % - (revision,len(added),len(changed),len(removed))) + sys.stderr.write('Exporting revision %d/%d with %d/%d/%d added/changed/removed files\n' % + (revision,max,len(added),len(changed),len(removed))) for a in added+changed: fctx=ctx.filectx(a) @@ -307,7 +308,7 @@ def verify_heads(ui,repo,cache): '\n%s (repo) != %s (cache)\n' % (b,sha1,c)) return True -def hg2git(repourl,m,marksfile,headsfile,tipfile,authors={}): +def hg2git(repourl,m,marksfile,headsfile,tipfile,authors={},sob=False): _max=int(m) marks_cache=load_cache(marksfile) @@ -323,13 +324,13 @@ def hg2git(repourl,m,marksfile,headsfile,tipfile,authors={}): min=int(state_cache.get('tip',0)) max=_max - if _max<=0: + if _max<0: max=tip c=0 last={} for rev in range(min,max): - c=export_commit(ui,repo,rev,marks_cache,heads_cache,last,tip,c,authors) + c=export_commit(ui,repo,rev,marks_cache,heads_cache,last,tip,c,authors,sob) c=export_tags(ui,repo,marks_cache,min,max,c,authors) @@ -359,10 +360,12 @@ if __name__=='__main__': help="File to read status from") parser.add_option("-r","--repo",dest="repourl", help="URL of repo to import") + parser.add_option("-s",action="store_true",dest="sob", + default=False,help="Enable parsing Signed-off-by lines") (options,args)=parser.parse_args() - m=0 + m=-1 if options.max!=None: m=options.max if options.marksfile==None: bail(parser,'--marks') @@ -371,4 +374,4 @@ if __name__=='__main__': if options.marksfile==None: bail(parser,'--repo') sys.exit(hg2git(options.repourl,m,options.marksfile,options.headsfile, - options.headsfile)) + options.headsfile,sob=options.sob)) From 80f028a16c51bef1284984941e8e248b42cd55d7 Mon Sep 17 00:00:00 2001 From: Rocco Rutte Date: Mon, 12 Mar 2007 08:12:08 +0000 Subject: [PATCH 23/38] 
hg2git.py: Display our max revision as progress, not tip Displaying tip doesn't make sense when we have some max given with -m/--max. Signed-off-by: Rocco Rutte --- hg2git.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hg2git.py b/hg2git.py index 9d832c3..1079684 100644 --- a/hg2git.py +++ b/hg2git.py @@ -330,7 +330,7 @@ def hg2git(repourl,m,marksfile,headsfile,tipfile,authors={},sob=False): c=0 last={} for rev in range(min,max): - c=export_commit(ui,repo,rev,marks_cache,heads_cache,last,tip,c,authors,sob) + c=export_commit(ui,repo,rev,marks_cache,heads_cache,last,max,c,authors,sob) c=export_tags(ui,repo,marks_cache,min,max,c,authors) From 20b4ca920b8b9031818a961a88b5b3f6f75e07ee Mon Sep 17 00:00:00 2001 From: Rocco Rutte Date: Mon, 12 Mar 2007 08:13:40 +0000 Subject: [PATCH 24/38] hg2git.py: Fix typo saving status to headsfile instead of statusfile This broke incremental imports as hg2git.sh wrapper overwrites headsfile with current values after the import is done. Signed-off-by: Rocco Rutte --- hg2git.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hg2git.py b/hg2git.py index 1079684..8955adc 100644 --- a/hg2git.py +++ b/hg2git.py @@ -374,4 +374,4 @@ if __name__=='__main__': if options.marksfile==None: bail(parser,'--repo') sys.exit(hg2git(options.repourl,m,options.marksfile,options.headsfile, - options.headsfile,sob=options.sob)) + options.statusfile,sob=options.sob)) From 230a320c843dd3b7d79cc5a69ca8429ade296008 Mon Sep 17 00:00:00 2001 From: Rocco Rutte Date: Mon, 12 Mar 2007 08:54:30 +0000 Subject: [PATCH 25/38] Basic support for an author map As git-(cvs|svn)import support it, make future git-hgimport :) support it, too. 
Signed-off-by: Rocco Rutte --- hg2git.py | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/hg2git.py b/hg2git.py index 8955adc..e5991c1 100644 --- a/hg2git.py +++ b/hg2git.py @@ -264,6 +264,25 @@ def export_tags(ui,repo,marks_cache,start,end,count,authors): count=checkpoint(count) return count +def load_authors(filename): + cache={} + if not os.path.exists(filename): + return cache + f=open(filename,'r') + l=0 + lre=re.compile('^([^= ]+)[ ]*=[ ]*(.+)$') + for line in f.readlines(): + l+=1 + m=lre.match(line) + if m==None: + sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename,l)) + continue + # put key:value in cache, key without ^: + cache[m.group(1)]=m.group(2) + f.close() + sys.stderr.write('Loaded %d authors\n' % l) + return cache + def load_cache(filename): cache={} if not os.path.exists(filename): @@ -362,6 +381,8 @@ if __name__=='__main__': help="URL of repo to import") parser.add_option("-s",action="store_true",dest="sob", default=False,help="Enable parsing Signed-off-by lines") + parser.add_option("-A","--authors",dest="authorfile", + help="Read authormap from AUTHORFILE") (options,args)=parser.parse_args() @@ -373,5 +394,9 @@ if __name__=='__main__': if options.marksfile==None: bail(parser,'--status') if options.marksfile==None: bail(parser,'--repo') + a={} + if options.authorfile!=None: + a=load_authors(options.authorfile) + sys.exit(hg2git(options.repourl,m,options.marksfile,options.headsfile, - options.statusfile,sob=options.sob)) + options.statusfile,authors=a,sob=options.sob)) From 7f9098823c05d9af3ebfb3e3cedd3aed0c4afcbe Mon Sep 17 00:00:00 2001 From: Rocco Rutte Date: Mon, 12 Mar 2007 09:06:48 +0000 Subject: [PATCH 26/38] hg2git.sh: Complete --help output's option listing This also adds some more verbose descriptions than the bare listing. 
Signed-off-by: Rocco Rutte --- hg2git.sh | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/hg2git.sh b/hg2git.sh index 34d5227..3dd9d5b 100755 --- a/hg2git.sh +++ b/hg2git.sh @@ -8,10 +8,19 @@ SFX_HEADS="heads" SFX_STATE="state" QUIET="" -USAGE="[-m ] [--quiet] []" +USAGE="[-m ] [--quiet] [-s] [-A ] [-r ]" LONG_USAGE="Import hg repository up to either tip or If is omitted, use last hg repository as obtained from state file, -GIT_DIR/$PFX-$SFX_STATE by default." +GIT_DIR/$PFX-$SFX_STATE by default. + +Options: + -m Maximum revision to import + --quiet Passed to git-fast-import(1) + -s Enable parsing Signed-off-by lines + -A Read author map from file + (Same as in git-svnimport(1) and git-cvsimport(1)) + -r Mercurial repository to import +" . git-sh-setup cd_to_toplevel From 75dc075d52ef89396e60ba7f519224058e060b24 Mon Sep 17 00:00:00 2001 From: Rocco Rutte Date: Mon, 12 Mar 2007 10:25:01 +0000 Subject: [PATCH 27/38] hg2git.sh: Add usage note that argument order matters Signed-off-by: Rocco Rutte --- hg2git.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hg2git.sh b/hg2git.sh index 3dd9d5b..8bd96f4 100755 --- a/hg2git.sh +++ b/hg2git.sh @@ -8,11 +8,13 @@ SFX_HEADS="heads" SFX_STATE="state" QUIET="" -USAGE="[-m ] [--quiet] [-s] [-A ] [-r ]" +USAGE="[--quiet] [-r ] [-m ] [-s] [-A ]" LONG_USAGE="Import hg repository up to either tip or If is omitted, use last hg repository as obtained from state file, GIT_DIR/$PFX-$SFX_STATE by default. +Note: The argument order matters. + Options: -m Maximum revision to import --quiet Passed to git-fast-import(1) From 59a481a2b0b4e73db903fe7773aa122fa74fd9d4 Mon Sep 17 00:00:00 2001 From: Rocco Rutte Date: Mon, 12 Mar 2007 10:26:46 +0000 Subject: [PATCH 28/38] hg2git.py: Allow for spaces in authorfile By allowing spaces in keys we allow for (re-)mapping complete lines like "Joe User " to be mapped to something else. 
Signed-off-by: Rocco Rutte --- hg2git.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hg2git.py b/hg2git.py index e5991c1..bcef2e1 100644 --- a/hg2git.py +++ b/hg2git.py @@ -270,7 +270,7 @@ def load_authors(filename): return cache f=open(filename,'r') l=0 - lre=re.compile('^([^= ]+)[ ]*=[ ]*(.+)$') + lre=re.compile('^([^=]+)[ ]*=[ ]*(.+)$') for line in f.readlines(): l+=1 m=lre.match(line) From cedbd0fb86e221755161fa80c8b04fdabb340308 Mon Sep 17 00:00:00 2001 From: Rocco Rutte Date: Mon, 12 Mar 2007 10:45:32 +0000 Subject: [PATCH 29/38] hg2git.py: Remove leading/trailing spaces from authormap The current regex may leave us with keys/values having trailing/leading spaces in all flavours which will break lookup. Solution: strip() key and value. Signed-off-by: Rocco Rutte --- hg2git.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hg2git.py b/hg2git.py index bcef2e1..9ac395b 100644 --- a/hg2git.py +++ b/hg2git.py @@ -278,7 +278,7 @@ def load_authors(filename): sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename,l)) continue # put key:value in cache, key without ^: - cache[m.group(1)]=m.group(2) + cache[m.group(1).strip()]=m.group(2).strip() f.close() sys.stderr.write('Loaded %d authors\n' % l) return cache From e448736a0b0d4b2d280e44843ef814ef575bf302 Mon Sep 17 00:00:00 2001 From: Rocco Rutte Date: Mon, 12 Mar 2007 11:05:33 +0000 Subject: [PATCH 30/38] Remove SHA stability note from readme Turns out, it isn't true though the diffs are still empty, i.e. $ git log --name-status --no-merges b1..b2 produces bogus output when b1 is a branch from the main hg repo and b2 a fork of it. But $ git diff b1..b2 still produces the correct result. Signed-off-by: Rocco Rutte --- hg2git.txt | 9 --------- 1 file changed, 9 deletions(-) diff --git a/hg2git.txt b/hg2git.txt index 4f57f72..13a2035 100644 --- a/hg2git.txt +++ b/hg2git.txt @@ -35,15 +35,6 @@ git-fast-import. 
This also implies that it heavily relies on strictly linear ordering of changesets from hg, i.e. its append-only storage model so that changesets hg2git already saw never get modified. -Import and SHA stability -======================== - -Currently it's only supported to map one hg repository to one git -repository. However, all forks of a hg repo can be imported into one git -repo each and then merged together (e.g. as different branches in the -final git repo) since the checksums are stable, i.e. one particular hg -changeset always produces the same git SHA1 checksum. - Todo ==== From 191928202b5156a466f2c3ec200ae7058e4ea889 Mon Sep 17 00:00:00 2001 From: Rocco Rutte Date: Mon, 12 Mar 2007 11:13:48 +0000 Subject: [PATCH 31/38] hg2git.py: Don't complain die for non-existent heads Previously, when no head was present under .git/refs/heads, we simply died as we couldn't open the file. Now, simply return None in case we cannot read from it. Signed-off-by: Rocco Rutte --- hg2git.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/hg2git.py b/hg2git.py index 9ac395b..e098c3d 100644 --- a/hg2git.py +++ b/hg2git.py @@ -307,10 +307,13 @@ def save_cache(filename,cache): def verify_heads(ui,repo,cache): def getsha1(branch): - f=open(os.getenv('GIT_DIR','/dev/null')+'/refs/heads/'+branch) - sha1=f.readlines()[0].split('\n')[0] - f.close() - return sha1 + try: + f=open(os.getenv('GIT_DIR','/dev/null')+'/refs/heads/'+branch) + sha1=f.readlines()[0].split('\n')[0] + f.close() + return sha1 + except IOError: + return None # get list of hg's branches to verify, don't take all git has branches=repo.branchtags() From 5732cd0313c28dbd5c1868525b0659946e05992e Mon Sep 17 00:00:00 2001 From: Rocco Rutte Date: Tue, 13 Mar 2007 10:59:22 +0000 Subject: [PATCH 32/38] hg2git.py: For the first revision, feed out full manifest For the mutt and hg repos, it didn't make a difference, but attempting to run the conversion on the opensolaris repo looks like this is 
needed. When we attempt to export some commit, special-case the revision number 0 and export all files the manifest has while labeling this a "full revision export". Otherwise we do what we did before labeling this a "delta revision export". Signed-off-by: Rocco Rutte --- hg2git.py | 43 ++++++++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/hg2git.py b/hg2git.py index e098c3d..acb3d4a 100644 --- a/hg2git.py +++ b/hg2git.py @@ -24,6 +24,8 @@ user_clean_re=re.compile('^["]([^"]+)["]$') cfg_master='master' # insert 'checkpoint' command after this many commits or none at all if 0 cfg_checkpoint_count=0 +# write some progress message every this many file contents written +cfg_export_boundary=1000 def usage(ret): sys.stderr.write(__doc__) @@ -163,6 +165,21 @@ def get_author(logmessage,committer,authors): return r return committer +def export_file_contents(ctx,manifest,files): + count=0 + max=len(files) + for file in files: + fctx=ctx.filectx(file) + d=fctx.data() + wr('M %s inline %s' % (gitmode(manifest.execf(file)),file)) + wr('data %d' % len(d)) # had some trouble with size() + wr(d) + count+=1 + if count%cfg_export_boundary==0: + sys.stderr.write('Exported %d/%d files\n' % (count,max)) + if max>cfg_export_boundary: + sys.stderr.write('Exported %d/%d files\n' % (count,max)) + def export_commit(ui,repo,revision,marks,heads,last,max,count,authors,sob): (_,user,(time,timezone),files,desc,branch,_)=get_changeset(ui,repo,revision,authors) parents=repo.changelog.parentrevs(revision) @@ -219,20 +236,20 @@ def export_commit(ui,repo,revision,marks,heads,last,max,count,authors,sob): ctx=repo.changectx(str(revision)) man=ctx.manifest() - added,changed,removed=get_filechanges(repo,revision,parents,man) - sys.stderr.write('Exporting revision %d/%d with %d/%d/%d added/changed/removed files\n' % - (revision,max,len(added),len(changed),len(removed))) - - for a in added+changed: - fctx=ctx.filectx(a) - d=fctx.data() - wr('M %s inline 
%s' % (gitmode(man.execf(a)),a)) - wr('data %d' % len(d)) # had some trouble with size() - wr(d) - - for r in removed: - wr('D %s' % r) + if revision==0: + # first revision: feed in full manifest + sys.stderr.write('Exporting full revision %d/%d with %d added files\n' % + (revision,max,len(man.keys()))) + export_file_contents(ctx,man,man.keys()) + else: + # later revision: feed in changed manifest + added,changed,removed=get_filechanges(repo,revision,parents,man) + sys.stderr.write('Exporting delta revision %d/%d with %d/%d/%d added/changed/removed files\n' % + (revision,max,len(added),len(changed),len(removed))) + export_file_contents(ctx,man,added+changed) + for r in removed: + wr('D %s' % r) wr() return checkpoint(count) From d9bb3271a4b739456f825979597480d650526ed2 Mon Sep 17 00:00:00 2001 From: Rocco Rutte Date: Tue, 13 Mar 2007 15:27:29 +0000 Subject: [PATCH 33/38] Add a note about hg's unnamed branches and multiple heads Signed-off-by: Rocco Rutte --- hg2git.py | 2 +- hg2git.txt | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/hg2git.py b/hg2git.py index acb3d4a..690acda 100644 --- a/hg2git.py +++ b/hg2git.py @@ -77,7 +77,7 @@ def gitmode(x): def wr(msg=''): print msg - #map(lambda x: sys.stderr.write('\t[%s]\n' % x),msg.split('\n')) + map(lambda x: sys.stderr.write('\t[%s]\n' % x),msg.split('\n')) def checkpoint(count): count=count+1 diff --git a/hg2git.txt b/hg2git.txt index 13a2035..f4da258 100644 --- a/hg2git.txt +++ b/hg2git.txt @@ -15,12 +15,16 @@ Using it is quite simple for a mercurial repository : mkdir repo-git # or whatever cd repo-git git init - hg2git.sh + hg2git.sh -r Incremental imports to track hg repos is supported, too. -Notes -===== +Notes/Limitations +================= + +hg2git supports multiple branches but only named branches with exaclty +one head each. Otherwise commits to the tip of these heads within branch +will get flattened into merge commits. 
As each git-fast-import run creates a new pack file, it may be required to repack the repository quite often for incremental imports (especially From ad283a91ca43259f2b8778035daf49326646a572 Mon Sep 17 00:00:00 2001 From: Rocco Rutte Date: Tue, 13 Mar 2007 16:31:57 +0000 Subject: [PATCH 34/38] hg2git.py: Bail out for certain errors New is that we also check for multiple tips having the same branch name, i.e. no unnamed heads. Signed-off-by: Rocco Rutte --- hg2git.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/hg2git.py b/hg2git.py index 690acda..f5f06c3 100644 --- a/hg2git.py +++ b/hg2git.py @@ -65,7 +65,7 @@ def get_branch(name): name=cfg_master return name -def get_changeset(ui,repo,revision,authors): +def get_changeset(ui,repo,revision,authors={}): node=repo.lookup(revision) (manifest,user,(time,timezone),files,desc,extra)=repo.changelog.read(node) tz="%+03d%02d" % (-timezone / 3600, ((-timezone % 3600) / 60)) @@ -332,19 +332,31 @@ def verify_heads(ui,repo,cache): except IOError: return None - # get list of hg's branches to verify, don't take all git has branches=repo.branchtags() l=[(-repo.changelog.rev(n), n, t) for t, n in branches.items()] l.sort() + # get list of hg's branches to verify, don't take all git has for _,_,b in l: b=get_branch(b) sys.stderr.write('Verifying branch [%s]\n' % b) sha1=getsha1(b) c=cache.get(b) if sha1!=c: - sys.stderr.write('Warning: Branch [%s] modified outside hg2git:' + sys.stderr.write('Error: Branch [%s] modified outside hg2git:' '\n%s (repo) != %s (cache)\n' % (b,sha1,c)) + return False + + # verify that branch has exactly one head + t={} + for h in repo.heads(): + (_,_,_,_,_,branch,_)=get_changeset(ui,repo,h) + if t.get(branch,False): + sys.stderr.write('Error: repository has at least one unnamed head: hg r%s\n' % + repo.changelog.rev(h)) + return False + t[branch]=True + return True def hg2git(repourl,m,marksfile,headsfile,tipfile,authors={},sob=False): From 
d988112549d2e4145f3da4f2afa5110f81451ada Mon Sep 17 00:00:00 2001 From: Rocco Rutte Date: Tue, 13 Mar 2007 16:43:20 +0000 Subject: [PATCH 35/38] hg2git.py: add -f/--force option to bypass validation checks Signed-off-by: Rocco Rutte --- hg2git.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/hg2git.py b/hg2git.py index f5f06c3..37717be 100644 --- a/hg2git.py +++ b/hg2git.py @@ -77,7 +77,7 @@ def gitmode(x): def wr(msg=''): print msg - map(lambda x: sys.stderr.write('\t[%s]\n' % x),msg.split('\n')) + #map(lambda x: sys.stderr.write('\t[%s]\n' % x),msg.split('\n')) def checkpoint(count): count=count+1 @@ -240,13 +240,13 @@ def export_commit(ui,repo,revision,marks,heads,last,max,count,authors,sob): if revision==0: # first revision: feed in full manifest sys.stderr.write('Exporting full revision %d/%d with %d added files\n' % - (revision,max,len(man.keys()))) + (revision+1,max,len(man.keys()))) export_file_contents(ctx,man,man.keys()) else: # later revision: feed in changed manifest added,changed,removed=get_filechanges(repo,revision,parents,man) sys.stderr.write('Exporting delta revision %d/%d with %d/%d/%d added/changed/removed files\n' % - (revision,max,len(added),len(changed),len(removed))) + (revision+1,max,len(added),len(changed),len(removed))) export_file_contents(ctx,man,added+changed) for r in removed: wr('D %s' % r) @@ -322,7 +322,7 @@ def save_cache(filename,cache): map(lambda x: f.write(':%s %s\n' % (str(x),str(cache.get(x)))),cache.keys()) f.close() -def verify_heads(ui,repo,cache): +def verify_heads(ui,repo,cache,force): def getsha1(branch): try: f=open(os.getenv('GIT_DIR','/dev/null')+'/refs/heads/'+branch) @@ -345,7 +345,7 @@ def verify_heads(ui,repo,cache): if sha1!=c: sys.stderr.write('Error: Branch [%s] modified outside hg2git:' '\n%s (repo) != %s (cache)\n' % (b,sha1,c)) - return False + if not force: return False # verify that branch has exactly one head t={} @@ -354,12 +354,12 @@ def 
verify_heads(ui,repo,cache): if t.get(branch,False): sys.stderr.write('Error: repository has at least one unnamed head: hg r%s\n' % repo.changelog.rev(h)) - return False + if not force: return False t[branch]=True return True -def hg2git(repourl,m,marksfile,headsfile,tipfile,authors={},sob=False): +def hg2git(repourl,m,marksfile,headsfile,tipfile,authors={},sob=False,force=False): _max=int(m) marks_cache=load_cache(marksfile) @@ -368,7 +368,7 @@ def hg2git(repourl,m,marksfile,headsfile,tipfile,authors={},sob=False): ui,repo=setup_repo(repourl) - if not verify_heads(ui,repo,heads_cache): + if not verify_heads(ui,repo,heads_cache,force): return 1 tip=repo.changelog.count() @@ -415,6 +415,8 @@ if __name__=='__main__': default=False,help="Enable parsing Signed-off-by lines") parser.add_option("-A","--authors",dest="authorfile", help="Read authormap from AUTHORFILE") + parser.add_option("-f","--force",action="store_true",dest="force", + default=False,help="Ignore validation errors by force") (options,args)=parser.parse_args() @@ -431,4 +433,4 @@ if __name__=='__main__': a=load_authors(options.authorfile) sys.exit(hg2git(options.repourl,m,options.marksfile,options.headsfile, - options.statusfile,authors=a,sob=options.sob)) + options.statusfile,authors=a,sob=options.sob,force=options.force)) From af2237607c130c1a1744e7efd0ab1876b257bcbd Mon Sep 17 00:00:00 2001 From: Rocco Rutte Date: Wed, 14 Mar 2007 08:34:18 +0000 Subject: [PATCH 36/38] hg2git.py: Create only lightweight tags The annotated tag with commit message summary was primarily only for debugging. 
Signed-off-by: Rocco Rutte --- hg2git.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/hg2git.py b/hg2git.py index 37717be..966ee63 100644 --- a/hg2git.py +++ b/hg2git.py @@ -263,20 +263,14 @@ def export_tags(ui,repo,marks_cache,start,end,count,authors): # ignore those tags not in our import range if rev<start or rev>=end: continue - ref=marks_cache.get(str(rev),None) + ref=get_parent_mark(rev,marks_cache) if ref==None: sys.stderr.write('Failed to find reference for creating tag' ' %s at r%d\n' % (tag,rev)) continue - (_,user,(time,timezone),_,desc,branch,_)=get_changeset(ui,repo,rev,authors) sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag,rev,ref)) - wr('tag %s' % tag) + wr('reset refs/tags/%s' % tag) wr('from %s' % ref) - wr('tagger %s %d %s' % (user,time,timezone)) - msg='hg2git created tag %s for hg revision %d on branch %s on (summary):\n\t%s' % (tag, - rev,branch,desc.split('\n')[0]) - wr('data %d' % (len(msg)+1)) - wr(msg) wr() count=checkpoint(count) return count From 287365c1602de984a43437ac1a5ab60186b8a198 Mon Sep 17 00:00:00 2001 From: Rocco Rutte Date: Wed, 14 Mar 2007 10:02:15 +0000 Subject: [PATCH 37/38] hg2git.py: Add simple delta revision feed Now we have three methods of feeding out changes 1) full for first revision or 2) thorough delta for merges (compare checksums with all parents) or 3) simple delta else (only got with manifest) This requires some cleanup so that we have only one place where we actually call the appropriate dumping method. The export_file_contents() method now also sorts its file list before writing out anything as this seems to speed up hg data retrieval a bit. 
Signed-off-by: Rocco Rutte --- hg2git.py | 45 ++++++++++++++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/hg2git.py b/hg2git.py index 966ee63..b411066 100644 --- a/hg2git.py +++ b/hg2git.py @@ -70,7 +70,7 @@ def get_changeset(ui,repo,revision,authors={}): (manifest,user,(time,timezone),files,desc,extra)=repo.changelog.read(node) tz="%+03d%02d" % (-timezone / 3600, ((-timezone % 3600) / 60)) branch=get_branch(extra.get('branch','master')) - return (manifest,fixup_user(user,authors),(time,tz),files,desc,branch,extra) + return (node,manifest,fixup_user(user,authors),(time,tz),files,desc,branch,extra) def gitmode(x): return x and '100755' or '100644' @@ -167,6 +167,7 @@ def get_author(logmessage,committer,authors): def export_file_contents(ctx,manifest,files): count=0 + files.sort() max=len(files) for file in files: fctx=ctx.filectx(file) @@ -180,8 +181,15 @@ def export_file_contents(ctx,manifest,files): if max>cfg_export_boundary: sys.stderr.write('Exported %d/%d files\n' % (count,max)) +def is_merge(parents): + c=0 + for parent in parents: + if parent>=0: + c+=1 + return c>1 + def export_commit(ui,repo,revision,marks,heads,last,max,count,authors,sob): - (_,user,(time,timezone),files,desc,branch,_)=get_changeset(ui,repo,revision,authors) + (revnode,_,user,(time,timezone),files,desc,branch,_)=get_changeset(ui,repo,revision,authors) parents=repo.changelog.parentrevs(revision) wr('commit refs/heads/%s' % branch) @@ -236,22 +244,33 @@ def export_commit(ui,repo,revision,marks,heads,last,max,count,authors,sob): ctx=repo.changectx(str(revision)) man=ctx.manifest() + added,changed,removed,type=[],[],[],'' if revision==0: # first revision: feed in full manifest - sys.stderr.write('Exporting full revision %d/%d with %d added files\n' % - (revision+1,max,len(man.keys()))) - export_file_contents(ctx,man,man.keys()) - else: - # later revision: feed in changed manifest + added=man.keys() + type='full' + elif is_merge(parents): + # 
later merge revision: feed in changed manifest + # for many files comparing checksums is expensive so only do it for + # merges where we really need it due to hg's revlog logic added,changed,removed=get_filechanges(repo,revision,parents,man) - sys.stderr.write('Exporting delta revision %d/%d with %d/%d/%d added/changed/removed files\n' % - (revision+1,max,len(added),len(changed),len(removed))) - export_file_contents(ctx,man,added+changed) - for r in removed: - wr('D %s' % r) + type='thorough delta' + else: + # later non-merge revision: feed in changed manifest + # if we have exactly one parent, just take the changes from the + # manifest without expensively comparing checksums + f=repo.status(repo.lookup(parents[0]),revnode)[:3] + added,changed,removed=f[1],f[0],f[2] + type='simple delta' + sys.stderr.write('Exporting %s revision %d/%d with %d/%d/%d added/changed/removed files\n' % + (type,revision+1,max,len(added),len(changed),len(removed))) + + map(lambda r: wr('D %s' % r),removed) + export_file_contents(ctx,man,added+changed) wr() + return checkpoint(count) def export_tags(ui,repo,marks_cache,start,end,count,authors): @@ -344,7 +363,7 @@ def verify_heads(ui,repo,cache,force): # verify that branch has exactly one head t={} for h in repo.heads(): - (_,_,_,_,_,branch,_)=get_changeset(ui,repo,h) + (_,_,_,_,_,_,branch,_)=get_changeset(ui,repo,h) if t.get(branch,False): sys.stderr.write('Error: repository has at least one unnamed head: hg r%s\n' % repo.changelog.rev(h)) From f9879136a954dadc34e6ab7c8d9ae5e7e2154988 Mon Sep 17 00:00:00 2001 From: Rocco Rutte Date: Wed, 14 Mar 2007 10:13:27 +0000 Subject: [PATCH 38/38] hg2git.py: Only print verification message for branches we have It's pointless for many branches to print the validation message for the first revision already; the same counts for incremental runs. 
Signed-off-by: Rocco Rutte --- hg2git.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hg2git.py b/hg2git.py index b411066..eb927a4 100644 --- a/hg2git.py +++ b/hg2git.py @@ -352,9 +352,10 @@ def verify_heads(ui,repo,cache,force): # get list of hg's branches to verify, don't take all git has for _,_,b in l: b=get_branch(b) - sys.stderr.write('Verifying branch [%s]\n' % b) sha1=getsha1(b) c=cache.get(b) + if sha1!=None and c!=None: + sys.stderr.write('Verifying branch [%s]\n' % b) if sha1!=c: sys.stderr.write('Error: Branch [%s] modified outside hg2git:' '\n%s (repo) != %s (cache)\n' % (b,sha1,c))