diff --git a/dev-tools/scripts/svnBranchToGit.py b/dev-tools/scripts/svnBranchToGit.py index 8c6c77bc8e8..49af3bac7ad 100644 --- a/dev-tools/scripts/svnBranchToGit.py +++ b/dev-tools/scripts/svnBranchToGit.py @@ -13,23 +13,23 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" Workaround for slow updates from svn to git. +""" Workaround for slow updates from an svn branch to git. See also jira issue INFRA-9182 Situation: -Remove svn repo ---> slow git-svn update process ---> Remote git-svn repo (upstream) +Remote svn repo ---> (slow) git-svn fetch ---> Remote git repo (upstream) | | | | v v -Local svn working copy --> this workaround ---> Local git repo +Local svn working copy ---> this workaround ---> Local git repo -Because of the slow remote git-svn update process the remote git repo is (far) behind +When the remote git-svn fetch is slow, the remote git repo is behind the remote svn repo. -When this script is run it will first check that the local repositories are clean. -Then it switches the svn working copy to branch, which updates from the remote. -Then it fetches branch from the git upstream repo, and merges the branch locally. +When this script is run it will first check that the local working copy and repository are clean. +Then it switches the svn working copy to the branch, which updates from the remote. +Then it the fetches branch from the git upstream repo, and merges the branch locally. Normally the local svn and git will then be at the same svn revision, and the script will exit. Otherwise the remote git repo is out of date, and the following happens. @@ -38,8 +38,12 @@ For the branch branchname in a local git repository following an upstream git-sv this maintains commits on a temporary git branch branchname.svn in the local git repository. These commits contain a message ending like this: "SvnRepoUrl diff -r EarlierSvnRevisionNumber:NextSvnRevisionNumber". -Otherwise the added commit messages look a lot like their counterparts from git svn, -only the committer is taken from the local git settings. +Otherwise the messages of the added commits are the same as their counterparts from git svn. + +Normally the added git commits and their git-svn counterparts have no differences between their working trees. +However such differences can occur, see also the documentation of git-svn reset and the limitations below. +In order not to interfere with git-svn this script only adds commits to a temporary branch +branchname.svn, and the commit messages are chosen differently, they do not contain git-svn-id. In case an earlier branchname.svn exists, it will first be deleted if necessary, and restarted at the later branch. @@ -48,41 +52,38 @@ Therefore branchname.svn is temporary and should only be used locally. By default, no more than 20 commits will be added to branchname.svn in a single run. The earlier revision number is taken from the git-svn-id message of git svn, -or from the LatestSvnRevisionNumber in the commit message of branchname.svn, +or from the latest revision number in the commit message on branchname.svn, whichever is later. This allows branchname.svn to be used as a local git branch instead of branchname -to develop new features locally, usually by mering branchname.svn into a feature branch. - -In more detail: - - switch the svn working copy to the branch, updating it to the latest revision, - - in the git repo: - - fetch the git repository from upstream. - - merge branchname from upstream/branchname, this is the branch that can be (far) behind. - - use the git-svn-id from the latest git commit on this branch to determine the corresponding svn revision. - - if the branchname.svn exists determine the latest svn revision from there. - - choose the latest svn revision number available. - - compare the git-svn revision to the svn latest revision (delay deleting a too early branchname.svn to later below). - - when the git repository has the same revision: - - exit reporting that branchname is up to date. - - when the git repository has an earlier revision: - - in the git working tree: - - if branchname.svn is not at the earlier svn revision number: - - delete branchname.svn - - recreate branch branchname.svn from branchname. - - check out branchname.svn - - get the svn commits from the latest available git svn commit (possible generated here), this uses the remote svn repo, - to the latest one from the svn log (but no more than the maximum): - - for all these commits: - - from the svn working copy, create a patch for the svn commit into file ~/patches/branchname.svn, - this takes most the the time as it uses the remote svn repo. - - in the git working tree: - - apply the patch ~/patches/branchname.svn, ignoring whitespace differences. - - commit using author, date and message from the svn log, and append the message with revision numbers. +to develop new features locally, for example by merging branchname.svn into a feature branch. """ +""" Limitations: + +This currently works by patching text, and therefore this does not work on binary files. +An example commit in lucene-solr that adds a binary file, on which this currently does not work correctly: +svn revision 1707457 +git commit 3c0390f71e1f08a17f32bc207b4003362f8b6ac2 + +When the local svn working copy contains file after updating to the latest available revision, +and there is an interim commit that deletes this file, this file is left as an empty file in the working directory +of the local git repository. + +All svn properties are ignored here. """ -This was developed on Linux using the following program versions: + +""" To be done: +Take binary files from the patch, and check out binary files directly from the remote svn repo directly into the local git repo. + +Going really far: checkout each interim svn revision, and use all (changed) files from there instead of the text diff. +Determining all files under version control with svn (svn ls) is far too slow for this (esp. when compared to git ls-tree), +so this is probably better done by using svnsync to setup a local mirror repository following the remote, +and then using svnlook on the local mirror repository. +Doing that only to avoid the limitations of this workaround does not appear to be worthwhile. +""" + +""" This was developed on Linux using the following program versions: python 2.7.6 git 1.9.1 svn 1.8.8 @@ -91,8 +92,8 @@ grep (GNU grep) 2.16 gitk (part of git) was used for manual testing: - reset branch to an earlier commit to simulate a non working update from svn to git, - delete branchname.svn, reset branchname.svn to earlier, -- diff a commit generated here to a commit from git svn, diffs between corresponding commits are normally empty, -- update, reload, sort commits by commit date, ... +- diff a commit generated here to a commit from git svn, +- update, reload, show commits by commit date, ... """ import os @@ -107,34 +108,34 @@ class SvnInfoHandler(ContentHandler): revisionAttr = "revision" def __init__(self): - self.lastChangedRevision = None + self.lastChangeRev = None def startElement(self, name, attrs): if name == "commit": self.lastChangeRev = int(attrs.getValue(self.revisionAttr)) - def lastChangeRevision(self): + def getLastChangeRevision(self): return self.lastChangeRev -class SvnLogEntry: +class SvnLogEntry(object): pass # attributes set in SvnLogHandler: revision, author, date, msg class SvnLogHandler(ContentHandler): # collect list of SvnLogEntry's - tagLogEntry = "logentry" - revisionAttr = "revision" - tagAuthor = "author" - tagDate = "date" - tagMsg = "msg" - charCollectTags = (tagAuthor, tagDate, tagMsg) # also used as SvnLogEntry attributes + logEntryTag = "logentry" + revisionAttr = "revision" # also used as SvnLogEntry attribute + authorTag = "author" + dateTag = "date" + msgTag = "msg" + charCollectTags = (authorTag, dateTag, msgTag) # also used as SvnLogEntry attributes def __init__(self): self.logEntries = [] self.chars = None def startElement(self, name, attrs): - if name == self.tagLogEntry: + if name == self.logEntryTag: self.lastLogEntry = SvnLogEntry() setattr(self.lastLogEntry, self.revisionAttr, int(attrs.getValue(self.revisionAttr))) for tag in self.charCollectTags: @@ -154,7 +155,7 @@ class SvnLogHandler(ContentHandler): # collect list of SvnLogEntry's self.chars = None return - if name == self.tagLogEntry: + if name == self.logEntryTag: self.logEntries.append(self.lastLogEntry) self.lastLogEntry = None @@ -162,33 +163,33 @@ class SvnLogHandler(ContentHandler): # collect list of SvnLogEntry's return self.logEntries -class PathName: - def __init__(self, pathName): +class SubProcessAtPath(object): + def __init__(self, pathName, verbose=True): + assert pathName != "" self.pathName = pathName + self.verbose = verbose def getPathName(self): return self.pathName + def chDirToPath(self): + if self.pathName != os.getcwd(): + os.chdir(self.pathName) + assert self.pathName == os.getcwd() + def __str__(self): return self.__class__.__name__ + "(" + self.pathName + ")" - -class SubProcess: - def __init__(self, verbose=True): - self.verbose = verbose - - def check_call(self, *args, **kwArgs): + def checkCall(self, *args, **kwArgs): assert type(*args) != types.StringType + self.chDirToPath() if self.verbose: print "check_call args:", " ".join(*args) subprocess.check_call(*args, **kwArgs) - def check_call_silent(self, *args, **kwArgs): # ignore self.verbose - assert type(*args) != types.StringType - subprocess.check_call(*args, **kwArgs) - - def check_output(self, *args, **kwArgs): + def checkOutput(self, *args, **kwArgs): assert type(*args) != types.StringType + self.chDirToPath() if self.verbose: print "check_output args:", " ".join(*args) result = subprocess.check_output(*args, **kwArgs) @@ -196,38 +197,33 @@ class SubProcess: print "check_output result:", result return result - def check_output_silent(self, *args, **kwArgs): # ignore self.verbose - assert type(*args) != types.StringType - return subprocess.check_output(*args, **kwArgs) - -class SvnWorkingCopy(PathName, SubProcess): +class SvnWorkingCopy(SubProcessAtPath): def __init__(self, pathName): - PathName.__init__(self, pathName) - SubProcess.__init__(self, verbose=False) + SubProcessAtPath.__init__(self, pathName, verbose=False) svnCmd = "svn" def ensureNoLocalModifications(self): - localMods = self.check_output((self.svnCmd, "status", self.pathName)) + localMods = self.checkOutput((self.svnCmd, "status")) if localMods: errorExit(self, "should not have local modifications:\n", localMods) def update(self): - self.check_call((self.svnCmd, "update", self.pathName)) + self.checkCall((self.svnCmd, "update")) def switch(self, repoBranchName): - self.check_call((self.svnCmd, "switch", ("^/" + repoBranchName), self.pathName)) + self.checkCall((self.svnCmd, "switch", ("^/" + repoBranchName))) def lastChangedRevision(self): - infoXml = self.check_output_silent((self.svnCmd, "info", self.pathName, "--xml")) + infoXml = self.checkOutput((self.svnCmd, "info", "--xml")) infoHandler = SvnInfoHandler() sax.parseString(infoXml, infoHandler) - return infoHandler.lastChangeRevision() + return infoHandler.getLastChangeRevision() def getLogEntries(self, fromRevision, toRevision, maxNumLogEntries): revRange = self.revisionsRange(fromRevision, toRevision) - logXml = self.check_output_silent((self.svnCmd, "log", self.pathName, "-r", revRange, "--xml", "-l", str(maxNumLogEntries))) + logXml = self.checkOutput((self.svnCmd, "log", "-r", revRange, "--xml", "-l", str(maxNumLogEntries))) logHandler = SvnLogHandler() sax.parseString(logXml, logHandler) return logHandler.getLogEntries() @@ -240,42 +236,40 @@ class SvnWorkingCopy(PathName, SubProcess): patchFile = open(patchFileName, 'w') try: print "Creating patch from", self.pathName, "between revisions", revRange - self.check_call((self.svnCmd, "diff", "-r", revRange, self.pathName), stdout=patchFile) + self.checkCall((self.svnCmd, "diff", "-r", revRange, + "--ignore-properties"), # git apply can fail on svn properties. + stdout=patchFile) finally: patchFile.close() print "Created patch file", patchFileName - def patchedFileNames(self, patchFileName): # return a sequence of the patched absolute file names + def patchedFileNames(self, patchFileName): # return a sequence of the patched file names + if os.path.getsize(patchFileName) == 0: # changed only svn properties, no files changed. + return [] + indexPrefix = "Index: " regExp = "^" + indexPrefix # at beginning of line - patchedFileNamesLines = self.check_output(("grep", regExp, patchFileName)) + patchedFileNamesLines = self.checkOutput(("grep", regExp, patchFileName)) # grep exits 1 whithout any match. indexPrefixLength = len(indexPrefix) return [line[indexPrefixLength:] for line in patchedFileNamesLines.split("\n") if len(line) > 0] -class GitRepository(PathName, SubProcess): +class GitRepository(SubProcessAtPath): def __init__(self, pathName): - PathName.__init__(self, pathName) - SubProcess.__init__(self, verbose=False) + SubProcessAtPath.__init__(self, pathName, verbose=False) self.currentBranch = None gitCmd = "git" - def _cmdForPath(self): - return (self.gitCmd, "-C", self.pathName) - - def _statusCmd(self): - return (self._cmdForPath() + ("status",)) - def checkOutBranch(self, branchName): - self.check_call(self._cmdForPath() + ("checkout", branchName)) + self.checkCall((self.gitCmd, "checkout", branchName)) self.currentBranch = branchName def getCurrentBranch(self): if self.currentBranch is None: - gitStatusOut = self.check_output(self._statusCmd()) + gitStatusOut = self.checkOutput((self.gitCmd, "status")) if gitStatusOut.startswith("On branch "): self.currentBranch = gitStatusOut.split[2] else: @@ -283,33 +277,49 @@ class GitRepository(PathName, SubProcess): return self.currentBranch def workingDirectoryClean(self): - gitStatusOut = self.check_output(self._statusCmd()) + gitStatusOut = self.checkOutput((self.gitCmd, "status")) expSubString = "nothing to commit, working directory clean" return gitStatusOut.find(expSubString) >= 0 def listBranches(self, pattern): - return self.check_output(self._cmdForPath() + ("branch", "--list", pattern)) + return self.checkOutput((self.gitCmd, "branch", "--list", pattern)) def branchExists(self, branchName): - listOut = self.listBranches(branchName) + listOut = self.listBranches(branchName) # CHECKME: using branchName as pattern may not always be ok. return len(listOut) > 0 def deleteBranch(self, branchName): - self.check_call(self._cmdForPath() + ("branch", "-D", branchName)) + self.checkCall((self.gitCmd, "branch", "-D", branchName)) if branchName == self.currentBranch: self.currentBranch = None def createBranch(self, branchName): - self.check_call(self._cmdForPath() + ("branch", branchName)) + self.checkCall((self.gitCmd, "branch", branchName)) def fetch(self, upStream): - self.check_call(self._cmdForPath() + ("fetch", upStream)) + self.checkCall((self.gitCmd, "fetch", upStream)) def merge(self, branch, fromBranch): - self.check_call(self._cmdForPath() + ("merge", branch, fromBranch)) + self.checkCall((self.gitCmd, "merge", branch, fromBranch)) def getCommitMessage(self, commitRef): - return self.check_output(self._cmdForPath() + ("log", "--format=%B", "-n", "1", commitRef)) + return self.checkOutput((self.gitCmd, "log", "--format=%B", "-n", "1", commitRef)) + + def getCommitAuthorName(self, commitRef): + return self.checkOutput((self.gitCmd, "log", "--format=%aN", "-n", "1", commitRef)) + + def getCommitAuthorEmail(self, commitRef): + return self.checkOutput((self.gitCmd, "log", "--format=%aE", "-n", "1", commitRef)) + + def getLatestCommitForAuthor(self, svnAuthor): + authorCommit = self.checkOutput( + " ".join((self.gitCmd, + "rev-list", "--all", "-i", ("--author=" + svnAuthor), # see git commit documentation on --author + "|", # pipe should have a buffer for at most a few commit ids. + "head", "-1")), + shell=True) # use shell pipe + authorCommit = authorCommit.rstrip("\n") + return authorCommit def getSvnRemoteAndRevision(self, gitSvnCommitRef): gitSvnCommitMessage = self.getCommitMessage(gitSvnCommitRef) @@ -333,27 +343,32 @@ class GitRepository(PathName, SubProcess): return None def applyPatch(self, patchFileName, stripDepth): - self.check_call((self.gitCmd, "apply", - ("-p" + str(stripDepth)), - "--whitespace=nowarn", - ("--directory=" + self.pathName), - patchFileName)) + self.checkCall((self.gitCmd, "apply", + ("-p" + str(stripDepth)), + "--whitespace=nowarn", + patchFileName)) def addAllToIndex(self): - self.check_call(self._cmdForPath() + ("add", "-A")) + self.checkCall((self.gitCmd, "add", "-A")) def deleteForced(self, fileName): - self.check_call(self._cmdForPath() + ("rm", "-f", fileName)) + self.checkCall((self.gitCmd, "rm", "-f", fileName)) - def commit(self, message, author, date): - self.check_call(self._cmdForPath() - + ("commit", - ("--message=" + message), - ("--author=" + author), - ("--date=" + date) )) + def commit(self, message, + authorName, authorEmail, authorDate, + committerName, committerEmail, committerDate): + author = ''.join((authorName, " <", authorEmail, ">")) + os.environ["GIT_COMMITTER_NAME"] = committerName # no need to save/restore earlier environment state. + os.environ["GIT_COMMITTER_EMAIL"] = committerEmail + os.environ["GIT_COMMITTER_DATE"] = committerDate + self.checkCall((self.gitCmd, "commit", + "--allow-empty", # only svn poperties changed. + ("--message=" + message), + ("--author=" + author), + ("--date=" + authorDate) )) def cleanDirsForced(self): - self.check_call(self._cmdForPath() + ("clean", "-fd")) + self.checkCall((self.gitCmd, "clean", "-fd")) @@ -417,6 +432,7 @@ def maintainTempGitSvnBranch(branchName, tempGitBranchName, if lastSvnRevision < diffBaseRevision: # unlikely, do nothing print gitRepo, gitRepo.getCurrentBranch(), "later than", svnWorkingCopy, ", nothing to update." + # CHECK: generate svn commits from the git commits? return print gitRepo, gitRepo.getCurrentBranch(), "earlier than", svnWorkingCopy @@ -440,36 +456,40 @@ def maintainTempGitSvnBranch(branchName, tempGitBranchName, assert gitRepo.getCurrentBranch() == tempGitBranchName - lenSvnWorkingCopyPathName = len(svnWorkingCopy.getPathName()) - patchStripDepth = len(svnWorkingCopy.getPathName().split(os.sep)) + patchStripDepth = 0 # patch generated at svn repo. maxNumLogEntries = maxCommits + 1 svnLogEntries = svnWorkingCopy.getLogEntries(diffBaseRevision, lastSvnRevision, maxNumLogEntries) - for (logEntryFrom, logEntryTo) in allSuccessivePairs(svnLogEntries): - print "" + numCommits = 0 + for (logEntryFrom, logEntryTo) in allSuccessivePairs(svnLogEntries): # create patch file from svn between the revisions: svnWorkingCopy.createPatchFile(logEntryFrom.revision, logEntryTo.revision, patchFileName) + patchedFileNames = svnWorkingCopy.patchedFileNames(patchFileName) - gitRepo.applyPatch(patchFileName, patchStripDepth) - print "Applied patch", patchFileName + if os.path.getsize(patchFileName) > 0: + gitRepo.applyPatch(patchFileName, patchStripDepth) + print "Applied patch", patchFileName + else: # only svn properties changed, do git commit for commit info only. + print "Empty patch", patchFileName gitRepo.addAllToIndex() # add all patch changes to the git index to be committed. # Applying the patch leaves files that have been actually deleted at zero size. # Therefore delete empty patched files from the git repo that do not exist in svn working copy: for patchedFileName in patchedFileNames: - versionControlledFileName = patchedFileName[lenSvnWorkingCopyPathName:] # includes leading slash - fileNameInGitRepo = gitRepo.getPathName() + versionControlledFileName + fileNameInGitRepo = os.path.join(gitRepo.getPathName(), patchedFileName) + fileNameInSvnWorkingCopy = os.path.join(svnWorkingCopy.getPathName(), patchedFileName) if os.path.isdir(fileNameInGitRepo): # print "Directory:", fileNameInGitRepo continue if not os.path.isfile(fileNameInGitRepo): - print "Already deleted:", fileNameInGitRepo + print "Possibly new binary file in svn, ignored here:", fileNameInGitRepo + # FIXME: Take a new binary file out of the svn repository directly. continue fileSize = os.path.getsize(fileNameInGitRepo) @@ -477,8 +497,9 @@ def maintainTempGitSvnBranch(branchName, tempGitBranchName, # print "Non empty file patched normally:", fileNameInGitRepo continue - # fileNameInGitRepo exists is empty - if os.path.isfile(patchedFileName): + # fileNameInGitRepo exists and is empty + if os.path.isfile(fileNameInSvnWorkingCopy): + # FIXME: this only works correctly when the svn working copy is hecked out at the target revision. print "Left empty file:", fileNameInGitRepo continue @@ -488,28 +509,35 @@ def maintainTempGitSvnBranch(branchName, tempGitBranchName, # commit, put toRevision at end so it can be picked up later. revisionsRange = svnWorkingCopy.revisionsRange(logEntryFrom.revision, logEntryTo.revision) message = logEntryTo.msg + "\n\n" + svnRemote + " diff -r " + revisionsRange + authorCommit = gitRepo.getLatestCommitForAuthor(logEntryTo.author) + authorName = gitRepo.getCommitAuthorName(authorCommit) + authorEmail = gitRepo.getCommitAuthorEmail(authorCommit) + # print "Author name and email:", authorName, authorEmail gitRepo.commit(message, - logEntryTo.author, # this normally matches a full earlier author entry that git will then use. - logEntryTo.date) + authorName, authorEmail, logEntryTo.date, + authorName, authorEmail, logEntryTo.date) # author is also git committer, just like git-svn - print "Commit author:", logEntryTo.author - print "Commit date:", logEntryTo.date + numCommits += 1 + + # print "Commit author:", logEntryTo.author + # print "Commit date:", logEntryTo.date print "Commit message:", logEntryTo.msg gitRepo.cleanDirsForced() # delete untracked directories and files if not gitRepo.workingDirectoryClean(): - errorExit(gitRepo, "on branch", gitRepo.getCurrentBranch(), "not clean") + errorExit(gitRepo, "on branch", gitRepo.getCurrentBranch(), "not clean, numCommits:", numCommits) + + print "Added", numCommits, "commit(s) to branch", tempGitBranchName if __name__ == "__main__": - import sys - testMode = False # when true, leave branch where it is, as if the last commits from upstream did not arrive defaultMaxCommits = 20 maxCommits = defaultMaxCommits + import sys argv = sys.argv[1:] while argv: if argv[0] == "test": @@ -519,7 +547,7 @@ if __name__ == "__main__": maxCommits = int(argv[0]) assert maxCommits >= 1 except: - errorExit("Argument(s) must be test and/or a maximum number of commits, defaults are false and " + defaultMaxCommits) + errorExit("Argument(s) should be test and/or a maximum number of commits, defaults are false and " + defaultMaxCommits) argv = argv[1:] repo = "lucene-solr" @@ -542,4 +570,3 @@ if __name__ == "__main__": patchFileName, maxCommits=maxCommits, testMode=testMode) -