zephyr/scripts/west_commands/zspdx/scanner.py
Steve Winslow fd31b9b4ac west: spdx: Generate SPDX 2.2 tag-value documents
This adds support to generate SPDX 2.2 tag-value documents via the
new west spdx command. The CMake file-based APIs are leveraged to
create relationships from source files to the corresponding
generated build files. SPDX-License-Identifier comments in source
files are scanned and filled into the SPDX documents.

Before `west build` is run, a specific file must be created in the
build directory so that the CMake API reply will run. This can be
done by running:

    west spdx --init -d BUILD_DIR

After `west build` is run, SPDX generation is then activated by
calling `west spdx`; currently this requires passing the build
directory as a parameter again:

    west spdx -d BUILD_DIR

This will generate three SPDX documents in `BUILD_DIR/spdx/`:

1) `app.spdx`: This contains the bill-of-materials for the
application source files used for the build.

2) `zephyr.spdx`: This contains the bill-of-materials for the
specific Zephyr source code files that are used for the build.

3) `build.spdx`: This contains the bill-of-materials for the built
output files.

Each file in the bill-of-materials is scanned, so that its hashes
(SHA256 and SHA1) can be recorded, along with any detected licenses
if an `SPDX-License-Identifier` appears in the file.

SPDX Relationships are created to indicate dependencies between
CMake build targets; build targets that are linked together; and
source files that are compiled to generate the built library files.

`west spdx` can be called with optional parameters for further
configuration:

* `-n PREFIX`: specifies a prefix for the Document Namespaces that
will be included in the generated SPDX documents. See SPDX spec 2.2
section 2.5 at
https://spdx.github.io/spdx-spec/2-document-creation-information/.
If -n is omitted, a default namespace will be generated according
to the default format described in section 2.5 using a random UUID.

* `-s SPDX_DIR`: specifies an alternate directory where the SPDX
documents should be written. If not specified, they will be saved
in `BUILD_DIR/spdx/`.

* `--analyze-includes`: in addition to recording the compiled
source code files (e.g. `.c`, `.S`) in the bills-of-materials, if
this flag is specified, `west spdx` will attempt to determine the
specific header files that are included for each `.c` file. This
will take longer, as it performs a dry run using the C compiler
for each `.c` file (using the same arguments that were passed to it
for the actual build).

* `--include-sdk`: if `--analyze-includes` is used, then adding
`--include-sdk` will create a fourth SPDX document, `sdk.spdx`,
which will list any header files included from the SDK.

Signed-off-by: Steve Winslow <steve@swinslow.net>
2021-05-05 11:14:06 -04:00

219 lines
6.7 KiB
Python

# Copyright (c) 2020, 2021 The Linux Foundation
#
# SPDX-License-Identifier: Apache-2.0
import hashlib
import os
import re
from west import log
from zspdx.licenses import LICENSES
from zspdx.util import getHashes
# ScannerConfig contains settings used to configure how the SPDX
# Document scanning should occur.
class ScannerConfig:
def __init__(self):
super(ScannerConfig, self).__init__()
# when assembling a Package's data, should we auto-conclude the
# Package's license, based on the licenses of its Files?
self.shouldConcludePackageLicense = True
# when assembling a Package's Files' data, should we auto-conclude
# each File's license, based on its detected license(s)?
self.shouldConcludeFileLicenses = True
# number of lines to scan for SPDX-License-Identifier (0 = all)
# defaults to 20
self.numLinesScanned = 20
# should we calculate SHA256 hashes for each Package's Files?
# note that SHA1 hashes are mandatory, per SPDX 2.2
self.doSHA256 = True
# should we calculate MD5 hashes for each Package's Files?
self.doMD5 = False
def parseLineForExpression(line):
"""Return parsed SPDX expression if tag found in line, or None otherwise."""
p = line.partition("SPDX-License-Identifier:")
if p[2] == "":
return None
# strip away trailing comment marks and whitespace, if any
expression = p[2].strip()
expression = expression.rstrip("/*")
expression = expression.strip()
return expression
def getExpressionData(filePath, numLines):
"""
Scans the specified file for the first SPDX-License-Identifier:
tag in the file.
Arguments:
- filePath: path to file to scan.
- numLines: number of lines to scan for an expression before
giving up. If 0, will scan the entire file.
Returns: parsed expression if found; None if not found.
"""
log.dbg(f" - getting licenses for {filePath}")
with open(filePath, "r") as f:
try:
lineno = 0
for line in f:
lineno += 1
if lineno > numLines > 0:
break
expression = parseLineForExpression(line)
if expression is not None:
return expression
except UnicodeDecodeError:
# invalid UTF-8 content
return None
# if we get here, we didn't find an expression
return None
def splitExpression(expression):
"""
Parse a license expression into its constituent identifiers.
Arguments:
- expression: SPDX license expression
Returns: array of split identifiers
"""
# remove parens and plus sign
e2 = re.sub(r'\(|\)|\+', "", expression, flags=re.IGNORECASE)
# remove word operators, ignoring case, leaving a blank space
e3 = re.sub(r' AND | OR | WITH ', " ", e2, flags=re.IGNORECASE)
# and split on space
e4 = e3.split(" ")
return sorted(e4)
def calculateVerificationCode(pkg):
"""
Calculate the SPDX Package Verification Code for all files in the package.
Arguments:
- pkg: Package
Returns: verification code as string
"""
hashes = []
for f in pkg.files.values():
hashes.append(f.sha1)
hashes.sort()
filelist = "".join(hashes)
hSHA1 = hashlib.sha1()
hSHA1.update(filelist.encode('utf-8'))
return hSHA1.hexdigest()
def checkLicenseValid(lic, doc):
"""
Check whether this license ID is a valid SPDX license ID, and add it
to the custom license IDs set for this Document if it isn't.
Arguments:
- lic: detected license ID
- doc: Document
"""
if lic not in LICENSES:
doc.customLicenseIDs.add(lic)
def getPackageLicenses(pkg):
"""
Extract lists of all concluded and infoInFile licenses seen.
Arguments:
- pkg: Package
Returns: sorted list of concluded license exprs,
sorted list of infoInFile ID's
"""
licsConcluded = set()
licsFromFiles = set()
for f in pkg.files.values():
licsConcluded.add(f.concludedLicense)
for licInfo in f.licenseInfoInFile:
licsFromFiles.add(licInfo)
return sorted(list(licsConcluded)), sorted(list(licsFromFiles))
def normalizeExpression(licsConcluded):
"""
Combine array of license expressions into one AND'd expression,
adding parens where needed.
Arguments:
- licsConcluded: array of license expressions
Returns: string with single AND'd expression.
"""
# return appropriate for simple cases
if len(licsConcluded) == 0:
return "NOASSERTION"
if len(licsConcluded) == 1:
return licsConcluded[0]
# more than one, so we'll need to combine them
# iff an expression has spaces, it needs parens
revised = []
for lic in licsConcluded:
if lic in ["NONE", "NOASSERTION"]:
continue
if " " in lic:
revised.append(f"({lic})")
else:
revised.append(lic)
return " AND ".join(revised)
def scanDocument(cfg, doc):
"""
Scan for licenses and calculate hashes for all Files and Packages
in this Document.
Arguments:
- cfg: ScannerConfig
- doc: Document
"""
for pkg in doc.pkgs.values():
log.inf(f"scanning files in package {pkg.cfg.name} in document {doc.cfg.name}")
# first, gather File data for this package
for f in pkg.files.values():
# set relpath based on package's relativeBaseDir
f.relpath = os.path.relpath(f.abspath, pkg.cfg.relativeBaseDir)
# get hashes for file
hashes = getHashes(f.abspath)
if not hashes:
log.wrn("unable to get hashes for file {f.abspath}; skipping")
continue
hSHA1, hSHA256, hMD5 = hashes
f.sha1 = hSHA1
if cfg.doSHA256:
f.sha256 = hSHA256
if cfg.doMD5:
f.md5 = hMD5
# get licenses for file
expression = getExpressionData(f.abspath, cfg.numLinesScanned)
if expression:
if cfg.shouldConcludeFileLicenses:
f.concludedLicense = expression
f.licenseInfoInFile = splitExpression(expression)
# check if any custom license IDs should be flagged for document
for lic in f.licenseInfoInFile:
checkLicenseValid(lic, doc)
# now, assemble the Package data
licsConcluded, licsFromFiles = getPackageLicenses(pkg)
if cfg.shouldConcludePackageLicense:
pkg.concludedLicense = normalizeExpression(licsConcluded)
pkg.licenseInfoFromFiles = licsFromFiles
pkg.verificationCode = calculateVerificationCode(pkg)