// lucene/gradle/generation/jflex.gradle

import java.nio.file.Files

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Declare the 'jflex' configuration on the root project; it supplies the classpath that JFlexTask (below) uses to run JFlex.

configure(rootProject) {
  // A dedicated configuration holding the JFlex dependency; JFlexTask resolves
  // rootProject.configurations.jflex to build the classpath for jflex.Main.
  configurations.maybeCreate("jflex")
  dependencies.add("jflex", deps.jflex)
}

// Location of the resources that accompany this build script.
// NOTE(review): presumably a directory derived from the script's own path - confirm against scriptResources.
def resources = scriptResources(buildscript)
// Skeleton files handed to JFlex through "--skel" by JFlexTask; each task below picks one of the two.
def skeletonDefault = file("${resources}/skeleton.default.txt")
// NOTE(review): judging by the file name, this variant disables scanner buffer expansion - confirm in the skeleton itself.
def skeletonNoBufferExpansion = file("${resources}/skeleton.disable.buffer.expansion.txt")

configure(project(":lucene:core")) {
  // Regenerates the StandardTokenizer scanner from its JFlex grammar.
  task generateStandardTokenizerInternal(type: JFlexTask) {
    group = "generation"
    description = "Regenerate StandardTokenizerImpl.java"

    skeleton = skeletonNoBufferExpansion
    jflexFile = file('src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex')

    // Post-process the generated source: the buffer size constant is rewritten
    // into a plain (non-static, non-final) instance field.
    doLast {
      def generatedJava = file('src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java')
      ant.replace(
          file: generatedJava,
          encoding: "UTF-8",
          token: "private static final int ZZ_BUFFERSIZE =",
          value: "private int ZZ_BUFFERSIZE ="
      )
    }
  }

  // Options passed to the checksum wrapper: follow-up formatting tasks and the
  // task the regeneration has to precede.
  def wrapperOptions = [
      andThenTasks: ["spotlessJava", "spotlessJavaApply"],
      mustRunBefore: ["compileJava"]
  ]
  def generateStandardTokenizer = wrapWithPersistentChecksums(generateStandardTokenizerInternal, wrapperOptions)

  regenerate.dependsOn(generateStandardTokenizer)
}

configure(project(":lucene:analysis:common")) {
  // Regenerates the ASCII TLD JFlex macro file and the TLD test list by running
  // GenerateJflexTLDMacros against the IANA TLD listing. The generator writes into
  // temporary files first; the real outputs are only overwritten when the
  // non-comment content of the TLD list has changed.
  task generateTldsInternal() {
    def tldZones = "https://data.iana.org/TLD/tlds-alpha-by-domain.txt"
    def jflexMacro = file("src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex")
    def tldList = file("src/test/org/apache/lucene/analysis/email/TLDs.txt")

    description "Regenerate top-level domain jflex macros and tests"
    group "generation"

    // The generator is run from the 'tools' source set, so it has to be built first.
    dependsOn { sourceSets.tools.runtimeClasspath }

    inputs.property "tldZones", tldZones
    outputs.files jflexMacro, tldList

    doFirst {
      File tmpJflexMacro = File.createTempFile(jflexMacro.getName(), ".tmp", getTemporaryDir())
      File tmpTldList = File.createTempFile(tldList.getName(), ".tmp", getTemporaryDir())

      try {
        // Arguments: source URL, macro output file, TLD list output file.
        project.javaexec {
          main = "org.apache.lucene.analysis.standard.GenerateJflexTLDMacros"
          classpath = sourceSets.tools.runtimeClasspath

          ignoreExitValue false
          args = [
              tldZones,
              tmpJflexMacro,
              tmpTldList
          ]
        }

        // LUCENE-9926: tldZones is regenerated daily. Compare the generated content (excluding comments) so that
        // we only update actual output files if non-comments have changed.
        def contentLines = { File file ->
          if (file.exists()) {
            List<String> lines = file.readLines("UTF-8")
            lines.removeIf { line -> line.isBlank() || line.startsWith("//") }
            return lines
          } else {
            return []
          }
        }

        if (contentLines(tmpTldList).equals(contentLines(tldList))) {
          logger.lifecycle("Generated TLD content identical as before, not updating.")
        } else {
          tldList.setBytes tmpTldList.bytes
          jflexMacro.setBytes tmpJflexMacro.bytes
          logger.lifecycle("You've regenerated the TLD include file, remember to regenerate UAX29URLEmailTokenizerImpl too.")
        }
      } finally {
        // Always remove the scratch files so they do not pile up in the task's
        // temporary directory across runs (or after a failed generator run).
        tmpJflexMacro.delete()
        tmpTldList.delete()
      }
    }
  }

  // Regenerates the Wikipedia tokenizer scanner from its JFlex grammar (default skeleton).
  task generateWikipediaTokenizerInternal(type: JFlexTask) {
    group = "generation"
    description = "Regenerate WikipediaTokenizerImpl.java"

    skeleton = skeletonDefault
    jflexFile = file('src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex')
  }

  // Regenerates the Classic tokenizer scanner from its JFlex grammar (default skeleton).
  task generateClassicTokenizerInternal(type: JFlexTask) {
    group = "generation"
    description = "Regenerate ClassicTokenizerImpl.java"

    skeleton = skeletonDefault
    jflexFile = file('src/java/org/apache/lucene/analysis/classic/ClassicTokenizerImpl.jflex')
  }

  // Regenerates the UAX29 URL/email tokenizer scanner. The forked JFlex JVM is
  // given a 12g heap, and the generated source is post-processed afterwards.
  task generateUAX29URLEmailTokenizerInternal(type: JFlexTask) {
    group = "generation"
    description = "Regenerate UAX29URLEmailTokenizerImpl.java"

    heapSize = "12g"
    skeleton = skeletonNoBufferExpansion
    jflexFile = file('src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.jflex')

    // The TLD macro file is also registered as an input of this task.
    inputs.file(file('src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex'))

    doFirst {
      logger.lifecycle("Regenerating UAX29URLEmailTokenizerImpl. This may take a long time (and requires ${heapSize} of memory!).")
    }

    // Rewrite the buffer size constant into a plain instance field in the generated source.
    doLast {
      def generatedJava = file('src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.java')
      ant.replace(
          file: generatedJava,
          encoding: "UTF-8",
          token: "private static final int ZZ_BUFFERSIZE =",
          value: "private int ZZ_BUFFERSIZE ="
      )
    }
  }

  // Checksum-wrapped variant of the UAX29 URL/email regeneration task.
  def uax29WrapperOptions = [
      andThenTasks: ["spotlessJava", "spotlessJavaApply"],
      mustRunBefore: ["compileJava"]
  ]
  def generateUAX29URLEmailTokenizer = wrapWithPersistentChecksums(generateUAX29URLEmailTokenizerInternal, uax29WrapperOptions)

  // UAX29URLEmailTokenizerImpl.jflex includes ASCIITLD.jflex, so the TLD macros have to be
  // up to date first. The ordering is declared on the wrapper and on the internal task:
  // scheduling the top-level wrappers alone doesn't prevent their subtask graphs from
  // being reordered.
  generateUAX29URLEmailTokenizer.dependsOn("generateTlds")
  generateUAX29URLEmailTokenizerInternal.mustRunAfter("generateTldsInternal")

  // Regenerates HTMLCharacterEntities.jflex by running the htmlentity.py script
  // (via python3) and then normalizing the result to LF line endings.
  task generateHTMLCharacterEntitiesInternal() {
    def target = file('src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex')
    def script = file("${resources}/htmlentity.py")

    // Keep this task consistent with the other generation tasks in this file,
    // which all declare a description and the "generation" group.
    description "Regenerate HTMLCharacterEntities.jflex"
    group "generation"

    outputs.files target
    inputs.file script

    doFirst {
      // The script receives the target file as its only argument and runs in the target's directory.
      quietExec {
        executable = project.externalTool("python3")
        workingDir = target.parentFile
        args += [
            "-B", // don't write any bytecode cache
            script,
            target
        ]
      }

      // Normalize line endings so the output does not depend on the platform it was generated on.
      project.ant.fixcrlf(
          file: target,
          encoding: "UTF-8",
          eol: "lf"
      )
    }
  }

  // Regenerates the HTML strip char filter scanner from its JFlex grammar (default skeleton).
  task generateHTMLStripCharFilterInternal(type: JFlexTask) {
    group = "generation"
    description = "Regenerate HTMLStripCharFilter.java"

    skeleton = skeletonDefault
    jflexFile = file('src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex')

    // The character entities file is registered as an additional input of this task.
    inputs.file(file('src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex'))
  }

  // Wrapper options shared by the remaining generation tasks of this project.
  def extraConfig = [
      andThenTasks: ["spotlessJava", "spotlessJavaApply"],
      mustRunBefore: ["compileJava"]
  ]

  // HTMLStripCharFilter.jflex includes HTMLCharacterEntities.jflex, so the entities file
  // must be regenerated first - both at the wrapper level and at the internal task level.
  def generateHTMLStripCharFilter = wrapWithPersistentChecksums(generateHTMLStripCharFilterInternal, extraConfig)
  generateHTMLStripCharFilter.dependsOn("generateHTMLCharacterEntities")
  generateHTMLStripCharFilterInternal.dependsOn("generateHTMLCharacterEntitiesInternal")

  // Create the remaining checksum wrappers (in the same order as before) and
  // attach every wrapper of this project to the 'regenerate' task.
  def wikipediaTokenizerWrapper = wrapWithPersistentChecksums(generateWikipediaTokenizerInternal, extraConfig)
  def classicTokenizerWrapper = wrapWithPersistentChecksums(generateClassicTokenizerInternal, extraConfig)
  def htmlCharacterEntitiesWrapper = wrapWithPersistentChecksums(generateHTMLCharacterEntitiesInternal, extraConfig)
  def tldsWrapper = wrapWithPersistentChecksums(generateTldsInternal, extraConfig)

  regenerate.dependsOn(
      wikipediaTokenizerWrapper,
      classicTokenizerWrapper,
      generateUAX29URLEmailTokenizer,
      htmlCharacterEntitiesWrapper,
      generateHTMLStripCharFilter,
      tldsWrapper
  )
}

/**
 * Runs JFlex (jflex.Main, resolved from the root project's 'jflex' configuration)
 * on a single grammar file and post-processes the generated Java source so that it
 * compiles cleanly and has LF line endings.
 */
class JFlexTask extends DefaultTask {
  /** The JFlex grammar to compile. */
  @InputFile
  File jflexFile

  /** Skeleton file passed to JFlex through the "--skel" option. */
  @InputFile
  File skeleton

  /** Maximum heap size for the forked JFlex JVM (for example "12g"); left at the default when unset. */
  @Internal
  String heapSize

  /**
   * The Java source file JFlex is expected to produce: the grammar's path with its
   * trailing ".jflex" extension swapped for ".java". Only the extension is replaced,
   * so a ".jflex" fragment elsewhere in the absolute path (for example in a parent
   * directory name) is left untouched.
   *
   * @return the generated Java file, located next to the grammar file
   */
  @OutputFile
  File getGeneratedFile() {
    return project.file(jflexFile.absolutePath.replaceFirst(/\.jflex$/, ".java"))
  }

  JFlexTask() {
    // Make sure the JFlex classpath is resolved/built before this task runs.
    dependsOn(project.rootProject.configurations.jflex)
  }

  /**
   * Invokes JFlex on {@link #jflexFile} and patches the generated source.
   *
   * @throws GradleException if the grammar file is not set or does not exist
   */
  @TaskAction
  def generate() {
    if (!jflexFile || !jflexFile.exists()) {
      throw new GradleException("JFlex file does not exist: ${jflexFile}")
    }

    // Single source of truth for the output location (also the declared task output).
    def target = getGeneratedFile()

    logger.lifecycle("Recompiling JFlex: ${project.rootDir.relativePath(jflexFile)}")

    project.javaexec {
      classpath {
        project.rootProject.configurations.jflex
      }

      main = "jflex.Main"
      args += [
          "-nobak",
          "--quiet",
          "--encoding", "UTF-8",
      ]

      if (heapSize) {
        maxHeapSize = heapSize
      }

      if (skeleton) {
        args += ["--skel", skeleton.absolutePath]
      }

      // Output goes into the grammar's own directory.
      args += [
          "-d", target.parentFile.absolutePath,
          jflexFile.absolutePath
      ]
    }

    // fix invalid SuppressWarnings from jflex, so that it works with javac.
    // we also need to suppress 'unused' because of dead code, so it passes ecjLint
    // you can't "stack" SuppressWarnings annotations, jflex adds its own, this is the only way
    // https://github.com/jflex-de/jflex/issues/762
    project.ant.replace(
          file: target,
          encoding: "UTF-8",
          token: 'SuppressWarnings("FallThrough")',
          value: 'SuppressWarnings({"fallthrough","unused"})'
    )

    // Correct line endings for Windows.
    project.ant.fixcrlf(
        file: target,
        encoding: "UTF-8",
        eol: "lf"
    )
  }
}