Merge branch 'master' of git://repo.or.cz/hg2git

* 'master' of git://repo.or.cz/hg2git: (38 commits)
  hg2git.py: Only print verification message for branches we have
  hg2git.py: Add simple delta revision feed
  hg2git.py: Create only leightweight tags
  hg2git.py: add -f/--force option to bypass validation checks
  hg2git.py: Bail out for certain errors
  Add a note about hg's unnamed branches and multiple heads
  hg2git.py: For the first revision, feed out full manifest
  hg2git.py: Don't complain die for non-existent heads
  Remove SHA stability note from readme
  hg2git.py: Remove leading/trailing spaces from authormap
  hg2git.py: Allow for spaces in authorfile
  hg2git.sh: Add usage note that argument order matters
  hg2git.sh: Complete --help output's option listing
  Basic support for an author map
  hg2git.py: Fix typo saving status to headsfile instead of statusfile
  hg2git.py: Display our max revision as progress, not tip
  hg2git.py: Disable parsing Signef-off-by lines and add -s to enable
  Basic support for command line options in hg2git.py
  Rename README.txt to hg2git.txt
  Only attempt to verify heads hg has, too
  ...
This commit is contained in:
Rocco Rutte
2007-03-14 10:22:52 +00:00
3 changed files with 600 additions and 0 deletions

450
hg2git.py Normal file
View File

@@ -0,0 +1,450 @@
#!/usr/bin/env python
# Copyright (c) 2007 Rocco Rutte <pdmef@gmx.net>
# License: GPLv2
"""hg2git.py - A mercurial-to-git filter for git-fast-import(1)
Usage: hg2git.py <hg repo url> <marks file> <heads file> <tip file>
"""
from mercurial import repo,hg,cmdutil,util,ui,revlog,node
from tempfile import mkstemp
from optparse import OptionParser
import re
import sys
import os
# silly regex to catch Signed-off-by lines in log message
sob_re=re.compile('^Signed-[Oo]ff-[Bb]y: (.+)$')
# silly regex to see if user field has email address
user_re=re.compile('([^<]+) (<[^>]+>)$')
# silly regex to clean out user names
user_clean_re=re.compile('^["]([^"]+)["]$')
# git branch for hg's default 'HEAD' branch
cfg_master='master'
# insert 'checkpoint' command after this many commits or none at all if 0
cfg_checkpoint_count=0
# write some progress message every this many file contents written
cfg_export_boundary=1000
def usage(ret):
sys.stderr.write(__doc__)
return ret
def setup_repo(url):
myui=ui.ui()
return myui,hg.repository(myui,url)
def fixup_user(user,authors):
if authors!=None:
# if we have an authors table, try to get mapping
# by defaulting to the current value of 'user'
user=authors.get(user,user)
name,mail,m='','',user_re.match(user)
if m==None:
# if we don't have 'Name <mail>' syntax, use 'user
# <devnull@localhost>' if use contains no at and
# 'user <user>' otherwise
name=user
if '@' not in user:
mail='<devnull@localhost>'
else:
mail='<%s>' % user
else:
# if we have 'Name <mail>' syntax, everything is fine :)
name,mail=m.group(1),m.group(2)
# remove any silly quoting from username
m2=user_clean_re.match(name)
if m2!=None:
name=m2.group(1)
return '%s %s' % (name,mail)
def get_branch(name):
if name=='HEAD':
name=cfg_master
return name
def get_changeset(ui,repo,revision,authors={}):
node=repo.lookup(revision)
(manifest,user,(time,timezone),files,desc,extra)=repo.changelog.read(node)
tz="%+03d%02d" % (-timezone / 3600, ((-timezone % 3600) / 60))
branch=get_branch(extra.get('branch','master'))
return (node,manifest,fixup_user(user,authors),(time,tz),files,desc,branch,extra)
def gitmode(x):
return x and '100755' or '100644'
def wr(msg=''):
print msg
#map(lambda x: sys.stderr.write('\t[%s]\n' % x),msg.split('\n'))
def checkpoint(count):
count=count+1
if cfg_checkpoint_count>0 and count%cfg_checkpoint_count==0:
sys.stderr.write("Checkpoint after %d commits\n" % count)
wr('checkpoint')
wr()
return count
def get_parent_mark(parent,marks):
"""Get the mark for some parent.
If we saw it in the current session, return :%d syntax and
otherwise the SHA1 from the cache."""
return marks.get(str(parent+1),':%d' % (parent+1))
def mismatch(f1,f2):
"""See if two revisions of a file are not equal."""
return node.hex(f1)!=node.hex(f2)
def outer_set(dleft,dright,l,c,r):
"""Loop over our repository and find all changed and missing files."""
for left in dleft.keys():
right=dright.get(left,None)
if right==None:
# we have the file but our parent hasn't: add to left set
l.append(left)
elif mismatch(dleft[left],right):
# we have it but checksums mismatch: add to center set
c.append(left)
for right in dright.keys():
left=dleft.get(right,None)
if left==None:
# if parent has file but we don't: add to right set
r.append(right)
# change is already handled when comparing child against parent
return l,c,r
def get_filechanges(repo,revision,parents,mleft):
"""Given some repository and revision, find all changed/deleted files."""
l,c,r=[],[],[]
for p in parents:
if p<0: continue
mright=repo.changectx(p).manifest()
dleft=mleft.keys()
dleft.sort()
dright=mright.keys()
dright.sort()
l,c,r=outer_set(mleft,mright,l,c,r)
return l,c,r
def get_author(logmessage,committer,authors):
"""As git distincts between author and committer of a patch, try to
extract author by detecting Signed-off-by lines.
This walks from the end of the log message towards the top skipping
empty lines. Upon the first non-empty line, it walks all Signed-off-by
lines upwards to find the first one. For that (if found), it extracts
authorship information the usual way (authors table, cleaning, etc.)
If no Signed-off-by line is found, this defaults to the committer.
This may sound stupid (and it somehow is), but in log messages we
accidentially may have lines in the middle starting with
"Signed-off-by: foo" and thus matching our detection regex. Prevent
that."""
loglines=logmessage.split('\n')
i=len(loglines)
# from tail walk to top skipping empty lines
while i>=0:
i-=1
if len(loglines[i].strip())==0: continue
break
if i>=0:
# walk further upwards to find first sob line, store in 'first'
first=None
while i>=0:
m=sob_re.match(loglines[i])
if m==None: break
first=m
i-=1
# if the last non-empty line matches our Signed-Off-by regex: extract username
if first!=None:
r=fixup_user(first.group(1),authors)
return r
return committer
def export_file_contents(ctx,manifest,files):
count=0
files.sort()
max=len(files)
for file in files:
fctx=ctx.filectx(file)
d=fctx.data()
wr('M %s inline %s' % (gitmode(manifest.execf(file)),file))
wr('data %d' % len(d)) # had some trouble with size()
wr(d)
count+=1
if count%cfg_export_boundary==0:
sys.stderr.write('Exported %d/%d files\n' % (count,max))
if max>cfg_export_boundary:
sys.stderr.write('Exported %d/%d files\n' % (count,max))
def is_merge(parents):
c=0
for parent in parents:
if parent>=0:
c+=1
return c>1
def export_commit(ui,repo,revision,marks,heads,last,max,count,authors,sob):
(revnode,_,user,(time,timezone),files,desc,branch,_)=get_changeset(ui,repo,revision,authors)
parents=repo.changelog.parentrevs(revision)
wr('commit refs/heads/%s' % branch)
wr('mark :%d' % (revision+1))
if sob:
wr('author %s %d %s' % (get_author(desc,user,authors),time,timezone))
wr('committer %s %d %s' % (user,time,timezone))
wr('data %d' % (len(desc)+1)) # wtf?
wr(desc)
wr()
src=heads.get(branch,'')
link=''
if src!='':
# if we have a cached head, this is an incremental import: initialize it
# and kill reference so we won't init it again
wr('from %s' % src)
heads[branch]=''
sys.stderr.write('Initializing branch [%s] to parent [%s]\n' %
(branch,src))
link=src # avoid making a merge commit for incremental import
elif link=='' and not heads.has_key(branch) and revision>0:
# newly created branch and not the first one: connect to parent
tmp=get_parent_mark(parents[0],marks)
wr('from %s' % tmp)
sys.stderr.write('Link new branch [%s] to parent [%s]\n' %
(branch,tmp))
link=tmp # avoid making a merge commit for branch fork
if parents:
l=last.get(branch,revision)
for p in parents:
# 1) as this commit implicitely is the child of the most recent
# commit of this branch, ignore this parent
# 2) ignore nonexistent parents
# 3) merge otherwise
if p==l or p==revision or p<0:
continue
tmp=get_parent_mark(p,marks)
# if we fork off a branch, don't merge with our parent via 'merge'
# as we have 'from' already above
if tmp==link:
continue
sys.stderr.write('Merging branch [%s] with parent [%s] from [r%d]\n' %
(branch,tmp,p))
wr('merge %s' % tmp)
last[branch]=revision
heads[branch]=''
# we need this later to write out tags
marks[str(revision)]=':%d'%(revision+1)
ctx=repo.changectx(str(revision))
man=ctx.manifest()
added,changed,removed,type=[],[],[],''
if revision==0:
# first revision: feed in full manifest
added=man.keys()
type='full'
elif is_merge(parents):
# later merge revision: feed in changed manifest
# for many files comparing checksums is expensive so only do it for
# merges where we really need it due to hg's revlog logic
added,changed,removed=get_filechanges(repo,revision,parents,man)
type='thorough delta'
else:
# later non-merge revision: feed in changed manifest
# if we have exactly one parent, just take the changes from the
# manifest without expensively comparing checksums
f=repo.status(repo.lookup(parents[0]),revnode)[:3]
added,changed,removed=f[1],f[0],f[2]
type='simple delta'
sys.stderr.write('Exporting %s revision %d/%d with %d/%d/%d added/changed/removed files\n' %
(type,revision+1,max,len(added),len(changed),len(removed)))
map(lambda r: wr('D %s' % r),removed)
export_file_contents(ctx,man,added+changed)
wr()
return checkpoint(count)
def export_tags(ui,repo,marks_cache,start,end,count,authors):
l=repo.tagslist()
for tag,node in l:
# ignore latest revision
if tag=='tip': continue
rev=repo.changelog.rev(node)
# ignore those tags not in our import range
if rev<start or rev>=end: continue
ref=get_parent_mark(rev,marks_cache)
if ref==None:
sys.stderr.write('Failed to find reference for creating tag'
' %s at r%d\n' % (tag,rev))
continue
sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag,rev,ref))
wr('reset refs/tags/%s' % tag)
wr('from %s' % ref)
wr()
count=checkpoint(count)
return count
def load_authors(filename):
cache={}
if not os.path.exists(filename):
return cache
f=open(filename,'r')
l=0
lre=re.compile('^([^=]+)[ ]*=[ ]*(.+)$')
for line in f.readlines():
l+=1
m=lre.match(line)
if m==None:
sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename,l))
continue
# put key:value in cache, key without ^:
cache[m.group(1).strip()]=m.group(2).strip()
f.close()
sys.stderr.write('Loaded %d authors\n' % l)
return cache
def load_cache(filename):
cache={}
if not os.path.exists(filename):
return cache
f=open(filename,'r')
l=0
for line in f.readlines():
l+=1
fields=line.split(' ')
if fields==None or not len(fields)==2 or fields[0][0]!=':':
sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename,l))
continue
# put key:value in cache, key without ^:
cache[fields[0][1:]]=fields[1].split('\n')[0]
f.close()
return cache
def save_cache(filename,cache):
f=open(filename,'w+')
map(lambda x: f.write(':%s %s\n' % (str(x),str(cache.get(x)))),cache.keys())
f.close()
def verify_heads(ui,repo,cache,force):
def getsha1(branch):
try:
f=open(os.getenv('GIT_DIR','/dev/null')+'/refs/heads/'+branch)
sha1=f.readlines()[0].split('\n')[0]
f.close()
return sha1
except IOError:
return None
branches=repo.branchtags()
l=[(-repo.changelog.rev(n), n, t) for t, n in branches.items()]
l.sort()
# get list of hg's branches to verify, don't take all git has
for _,_,b in l:
b=get_branch(b)
sha1=getsha1(b)
c=cache.get(b)
if sha1!=None and c!=None:
sys.stderr.write('Verifying branch [%s]\n' % b)
if sha1!=c:
sys.stderr.write('Error: Branch [%s] modified outside hg2git:'
'\n%s (repo) != %s (cache)\n' % (b,sha1,c))
if not force: return False
# verify that branch has exactly one head
t={}
for h in repo.heads():
(_,_,_,_,_,_,branch,_)=get_changeset(ui,repo,h)
if t.get(branch,False):
sys.stderr.write('Error: repository has at least one unnamed head: hg r%s\n' %
repo.changelog.rev(h))
if not force: return False
t[branch]=True
return True
def hg2git(repourl,m,marksfile,headsfile,tipfile,authors={},sob=False,force=False):
_max=int(m)
marks_cache=load_cache(marksfile)
heads_cache=load_cache(headsfile)
state_cache=load_cache(tipfile)
ui,repo=setup_repo(repourl)
if not verify_heads(ui,repo,heads_cache,force):
return 1
tip=repo.changelog.count()
min=int(state_cache.get('tip',0))
max=_max
if _max<0:
max=tip
c=0
last={}
for rev in range(min,max):
c=export_commit(ui,repo,rev,marks_cache,heads_cache,last,max,c,authors,sob)
c=export_tags(ui,repo,marks_cache,min,max,c,authors)
sys.stderr.write('Issued %d commands\n' % c)
state_cache['tip']=max
state_cache['repo']=repourl
save_cache(tipfile,state_cache)
return 0
if __name__=='__main__':
def bail(parser,opt):
sys.stderr.write('Error: No %s option given\n' % opt)
parser.print_help()
sys.exit(2)
parser=OptionParser()
parser.add_option("-m","--max",type="int",dest="max",
help="Maximum hg revision to import")
parser.add_option("--marks",dest="marksfile",
help="File to read git-fast-import's marks from")
parser.add_option("--heads",dest="headsfile",
help="File to read last run's git heads from")
parser.add_option("--status",dest="statusfile",
help="File to read status from")
parser.add_option("-r","--repo",dest="repourl",
help="URL of repo to import")
parser.add_option("-s",action="store_true",dest="sob",
default=False,help="Enable parsing Signed-off-by lines")
parser.add_option("-A","--authors",dest="authorfile",
help="Read authormap from AUTHORFILE")
parser.add_option("-f","--force",action="store_true",dest="force",
default=False,help="Ignore validation errors by force")
(options,args)=parser.parse_args()
m=-1
if options.max!=None: m=options.max
if options.marksfile==None: bail(parser,'--marks')
if options.marksfile==None: bail(parser,'--heads')
if options.marksfile==None: bail(parser,'--status')
if options.marksfile==None: bail(parser,'--repo')
a={}
if options.authorfile!=None:
a=load_authors(options.authorfile)
sys.exit(hg2git(options.repourl,m,options.marksfile,options.headsfile,
options.statusfile,authors=a,sob=options.sob,force=options.force))

93
hg2git.sh Executable file
View File

@@ -0,0 +1,93 @@
#!/bin/sh
ROOT="`dirname $0`"
REPO=""
PFX="hg2git"
SFX_MARKS="marks"
SFX_HEADS="heads"
SFX_STATE="state"
QUIET=""
USAGE="[--quiet] [-r <repo>] [-m <max>] [-s] [-A <file>]"
LONG_USAGE="Import hg repository <repo> up to either tip or <max>
If <repo> is omitted, use last hg repository as obtained from state file,
GIT_DIR/$PFX-$SFX_STATE by default.
Note: The argument order matters.
Options:
-m Maximum revision to import
--quiet Passed to git-fast-import(1)
-s Enable parsing Signed-off-by lines
-A Read author map from file
(Same as in git-svnimport(1) and git-cvsimport(1))
-r Mercurial repository to import
"
. git-sh-setup
cd_to_toplevel
while case "$#" in 0) break ;; esac
do
case "$1" in
-r|--r|--re|--rep|--repo)
shift
REPO="$1"
;;
--q|--qu|--qui|--quie|--quiet)
QUIET="--quiet"
;;
-*)
# pass any other options down to hg2git.py
break
;;
*)
break
;;
esac
shift
done
# for convenience: get default repo from state file
if [ x"$REPO" = x -a -f "$GIT_DIR/$PFX-$SFX_STATE" ] ; then
REPO="`egrep '^:repo ' "$GIT_DIR/$PFX-$SFX_STATE" | cut -d ' ' -f 2`"
echo "Using last hg repository \"$REPO\""
fi
# make sure we have a marks cache
if [ ! -f "$GIT_DIR/$PFX-$SFX_MARKS" ] ; then
touch "$GIT_DIR/$PFX-$SFX_MARKS"
fi
GIT_DIR="$GIT_DIR" python "$ROOT/hg2git.py" \
--repo "$REPO" \
--marks "$GIT_DIR/$PFX-$SFX_MARKS" \
--heads "$GIT_DIR/$PFX-$SFX_HEADS" \
--status "$GIT_DIR/$PFX-$SFX_STATE" \
"$@" \
| git-fast-import $QUIET --export-marks="$GIT_DIR/$PFX-$SFX_MARKS.tmp" \
|| die 'Git fast-import failed'
# move recent marks cache out of the way...
if [ -f "$GIT_DIR/$PFX-$SFX_MARKS" ] ; then
mv "$GIT_DIR/$PFX-$SFX_MARKS" "$GIT_DIR/$PFX-$SFX_MARKS.old"
else
touch "$GIT_DIR/$PFX-$SFX_MARKS.old"
fi
# ...to create a new merged one
cat "$GIT_DIR/$PFX-$SFX_MARKS.old" "$GIT_DIR/$PFX-$SFX_MARKS.tmp" \
| uniq > "$GIT_DIR/$PFX-$SFX_MARKS"
# cleanup
rm -rf "$GIT_DIR/$PFX-$SFX_MARKS.old" "$GIT_DIR/$PFX-$SFX_MARKS.tmp"
# save SHA1s of current heads for incremental imports
# and connectivity (plus sanity checking)
for head in `git branch | sed 's#^..##'` ; do
id="`git-rev-parse $head`"
echo ":$head $id"
done > "$GIT_DIR/$PFX-$SFX_HEADS"
# check diff with color:
# ( for i in `find . -type f | grep -v '\.git'` ; do diff -u $i $REPO/$i ; done | cdiff ) | less -r

57
hg2git.txt Normal file
View File

@@ -0,0 +1,57 @@
hg2git.(sh|py) - mercurial to git converter using git-fast-import
Legal
=====
The scripts are licensed under the GPL version 2 and were written by
Rocco Rutte <pdmef@gmx.net> with hints and help from the git list and
#mercurial on freenode.
Usage
=====
Using it is quite simple for a mercurial repository <repo>:
mkdir repo-git # or whatever
cd repo-git
git init
hg2git.sh -r <repo>
Incremental imports to track hg repos is supported, too.
Notes/Limitations
=================
hg2git supports multiple branches but only named branches with exaclty
one head each. Otherwise commits to the tip of these heads within branch
will get flattened into merge commits.
As each git-fast-import run creates a new pack file, it may be required
to repack the repository quite often for incremental imports (especially
when importing a small number of changesets per incremental import).
Design
======
hg2git.py was designed in a way that doesn't require a 2-pass mechanism
or any prior repository analysis: if just feeds what it finds into
git-fast-import. This also implies that it heavily relies on strictly
linear ordering of changesets from hg, i.e. its append-only storage
model so that changesets hg2git already saw never get modified.
Todo
====
For incremental imports, handling tags needs to be reworked (maybe):
Right now we assume that once a tag is created, it stays forever and
never changes. However,
1) tags in hg may be removed
2) tags may change
I'm not yet sure how to handle this and how this interferes with
non-hg-based tags in git.
The same for branches: They may get removed.
For one-time conversions, everything is fine.