Skip to content

Commit

Permalink
Merge pull request #1 from nvnieuwk/refactor
Browse files Browse the repository at this point in the history
Refactor
  • Loading branch information
nvnieuwk authored Jan 22, 2024
2 parents 6cc4591 + efd6b02 commit bd628f1
Show file tree
Hide file tree
Showing 9 changed files with 186 additions and 256 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/go.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ name: Go

on:
push:
branches: [ "master" ]
branches: [ "main", "dev" ]
pull_request:
branches: [ "master" ]
branches: [ "main", "dev" ]

jobs:

Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ svync --config <config.yaml> --input <input.vcf>
| --- | --- | --- |
| `--output`/`-o` | Path to the output VCF file | `stdout` |
| `--nodate`/`--nd` | Do not add the date to the output VCF file | `false` |
| `--notation`/`-n` | The notation to use for the output VCF file. Must be one of: breakpoint, breakend. | none |
| `--mute-warnings`/`--mw` | Do not output warnings | `false` |

## Configuration
The configuration file is the core of the standardization in Svync. More information can be found in the [configuration documentation](docs/configuration.md).
Expand Down
29 changes: 13 additions & 16 deletions svync.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@ package main
import (
"log"
"os"
"slices"
"strings"

"github.com/nvnieuwk/svync/svync_api"
cli "github.com/urfave/cli/v2"
Expand All @@ -29,23 +27,23 @@ func main() {
Usage: "The location to the output VCF file, defaults to stdout",
Category: "Optional",
},
&cli.StringFlag{
Name: "notation",
Aliases: []string{"n"},
Usage: "The notation to use for the output VCF file. Must be one of: breakpoint, breakend. By default the notation isn't changed",
// TODO re-add this when conversion is implemented
// &cli.BoolFlag{
// Name: "to-breakpoint",
// Aliases: []string{"tb"},
// Usage: "Convert pairs of breakends to a single breakpoint variant. WARNING: this will cause some loss of data.",
// Category: "Optional",
// },
&cli.BoolFlag{
Name: "mute-warnings",
Aliases: []string{"mw"},
Usage: "Mute all warnings.",
Category: "Optional",
Action: func(c *cli.Context, input string) error {
validNotations := []string{"breakpoint", "breakend"}
if slices.Contains(validNotations, input) {
return nil
}
return cli.Exit("Invalid notation '"+input+"', must be one of: "+strings.Join(validNotations, ", "), 1)
},
},
&cli.StringFlag{
Name: "config",
Aliases: []string{"c"},
Usage: "Configuration file (YAML) to use for the parsing of INFO and FORMAT fields",
Usage: "Configuration file (YAML) used for standardizing the VCF",
Required: true,
Category: "Required",
},
Expand All @@ -59,8 +57,7 @@ func main() {
},
Action: func(Cctx *cli.Context) error {
config := svync_api.ReadConfig(Cctx)
vcf := svync_api.ReadVcf(Cctx)
vcf.StandardizeAndOutput(config, Cctx) // Standardize VCF and write to output file
svync_api.Execute(Cctx, config)
return nil
},
}
Expand Down
196 changes: 124 additions & 72 deletions svync_api/read.go → svync_api/execute.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,60 +15,80 @@ import (
)

// Read the VCF file and return it as a VCF struct
func ReadVcf(Cctx *cli.Context) *VCF {
func Execute(Cctx *cli.Context, config *Config) {
logger := log.New(os.Stderr, "", 0)

file := Cctx.String("input")
openFile, err := os.Open(file)
defer openFile.Close()
inputVcf, err := os.Open(file)
defer inputVcf.Close()
if err != nil {
logger.Fatal(err)
}

vcf := newVCF()
if strings.HasSuffix(file, ".gz") {
vcf.readBgzip(openFile)
} else {
vcf.readPlain(openFile)
}

return vcf
}

// Initialize a new VCF
func newVCF() *VCF {
return &VCF{
Header: Header{
Info: map[string]HeaderLineIdNumberTypeDescription{},
Format: map[string]HeaderLineIdNumberTypeDescription{},
Alt: map[string]HeaderLineIdDescription{},
Filter: map[string]HeaderLineIdDescription{},
Contig: []HeaderLineIdLength{},
},
Variants: map[string]Variant{},
}
}

// Read the VCF file in bgzip format and convert it to a VCF struct
func (vcf *VCF) readBgzip(input *os.File) {
logger := log.New(os.Stderr, "", 0)

bgReader, err := bgzf.NewReader(input, 1)
if err != nil {
logger.Fatal(err)
header := newHeader()
breakEndVariants := &map[string]Variant{}
headerIsMade := false
variantCount := 0

stdout := true
var outputFile *os.File
if Cctx.String("output") != "" {
stdout = false
outputFile, err = os.Create(Cctx.String("output"))
if err != nil {
logger.Fatalf("Failed to create the output file: %v", err)
}
defer outputFile.Close()
}
defer bgReader.Close()

for {
b, _, err := readBgzipLine(bgReader)
if strings.HasSuffix(file, ".gz") {
bgReader, err := bgzf.NewReader(inputVcf, 1)
if err != nil {
if err == io.EOF {
break
logger.Fatal(err)
}
defer bgReader.Close()

for {
b, _, err := readBgzipLine(bgReader)
if err != nil {
if err == io.EOF {
break
}
logger.Fatal(string(b[:]))
}
logger.Fatal(string(b[:]))

parseLine(
string(bytes.TrimSpace(b[:])),
header,
breakEndVariants,
config,
Cctx,
&headerIsMade,
outputFile,
stdout,
&variantCount,
)
}
} else {
scanner := bufio.NewScanner(inputVcf)
const maxCapacity = 8 * 1000000 // 8 MB
scanner.Buffer(make([]byte, maxCapacity), maxCapacity)
for scanner.Scan() {
parseLine(
scanner.Text(),
header,
breakEndVariants,
config,
Cctx,
&headerIsMade,
outputFile,
stdout,
&variantCount,
)
}

vcf.parse(string(bytes.TrimSpace(b[:])))
if err := scanner.Err(); err != nil {
logger.Fatal(err)
}
}

}
Expand All @@ -95,44 +115,59 @@ func readBgzipLine(r *bgzf.Reader) ([]byte, bgzf.Chunk, error) {
return data, chunk, err
}

// Read the VCF file in plain text format and convert it to a VCF struct
func (vcf *VCF) readPlain(input *os.File) {
logger := log.New(os.Stderr, "", 0)

scanner := bufio.NewScanner(input)
const maxCapacity = 8 * 1000000 // 8 MB
scanner.Buffer(make([]byte, maxCapacity), maxCapacity)
for scanner.Scan() {
vcf.parse(scanner.Text())
}

if err := scanner.Err(); err != nil {
logger.Fatal(err)
// parseLine handles a single line of the input VCF.
//
// Lines starting with "#" are header lines and are parsed into header. The
// first non-header line triggers writeHeader exactly once (tracked through
// headerIsMade), so the header is written only after every header line that
// precedes the first variant has been parsed. Every non-header line is turned
// into a Variant, counted in variantCount, and passed to standardizeAndOutput.
// Blank lines are skipped.
//
// breakEndVariants is currently only referenced by the disabled
// breakend-to-breakpoint conversion below.
func parseLine(
	line string,
	header *Header,
	breakEndVariants *map[string]Variant,
	config *Config,
	Cctx *cli.Context,
	headerIsMade *bool,
	outputFile *os.File,
	stdout bool,
	variantCount *int,
) {
	// A blank line carries no tab-separated fields; skip it rather than
	// indexing into an empty record in createVariant.
	if strings.TrimSpace(line) == "" {
		return
	}

	// Header lines are recognised by their leading "#".
	if strings.HasPrefix(line, "#") {
		header.parse(line)
		return
	}

	// First variant line: the input header is complete, so emit the output
	// header before any variant.
	if !*headerIsMade {
		writeHeader(config, Cctx, header, outputFile, stdout)
		*headerIsMade = true
	}

	// id := strings.Split(line, "\t")[2]
	variant := createVariant(line, header, Cctx)

	// TODO continue work on this later
	// Convert breakends to breakpoints if the --to-breakpoint flag is set
	// if Cctx.Bool("to-breakpoint") && variant.Info["SVTYPE"][0] == "BND" && len(variant.Info["MATEID"]) == 1 {
	// 	mateid := variant.Info["MATEID"][0]
	// 	if mate, ok := (*breakEndVariants)[mateid]; ok {
	// 		variant = toBreakPoint(variant, &mate)
	// 		delete(*breakEndVariants, mateid)
	// 	} else {
	// 		(*breakEndVariants)[id] = *variant
	// 		return
	// 	}
	// }

	// Standardize and output the variant
	*variantCount++
	standardizeAndOutput(config, Cctx, variant, outputFile, stdout, *variantCount)
}

// Parse the line and return it as a new Variant struct
func (variant *Variant) parse(line string) {
func createVariant(line string, header *Header, Cctx *cli.Context) *Variant {
logger := log.New(os.Stderr, "", 0)

err := error(nil)
variant := new(Variant)
variant.Header = header

data := strings.Split(line, "\t")
variant.Chromosome = data[0]

var err error
variant.Pos, err = strconv.ParseInt(data[1], 0, 64)
if err != nil {
logger.Fatal(err)
Expand All @@ -152,7 +187,7 @@ func (variant *Variant) parse(line string) {
if len(split) > 1 {
value = split[1]
}
variant.Info[field] = parseInfoFormat(field, value, variant.Header.Info)
variant.Info[field] = parseInfoFormat(field, value, variant.Header.Info, Cctx)
}

variant.Format = map[string]VariantFormat{}
Expand All @@ -166,18 +201,22 @@ func (variant *Variant) parse(line string) {
}
for idx, val := range strings.Split(value, ":") {
header := formatHeaders[idx]
variant.Format[sample].Content[header] = parseInfoFormat(header, val, variant.Header.Format)
variant.Format[sample].Content[header] = parseInfoFormat(header, val, variant.Header.Format, Cctx)
}
}

return variant

}

// Parse the value of the INFO or FORMAT field and return it as a slice of strings
func parseInfoFormat(header string, value string, infoFormatLines map[string]HeaderLineIdNumberTypeDescription) []string {
func parseInfoFormat(header string, value string, infoFormatLines map[string]HeaderLineIdNumberTypeDescription, Cctx *cli.Context) []string {
logger := log.New(os.Stderr, "", 0)
headerLine := infoFormatLines[header]
if headerLine == (HeaderLineIdNumberTypeDescription{}) {
logger.Printf("Field %s not found in header, defaulting to Type 'String' and Number '1'", header)
if !Cctx.Bool("mute-warnings") {
logger.Printf("Field %s not found in header, defaulting to Type 'String' and Number '1'", header)
}
headerLine = HeaderLineIdNumberTypeDescription{
Id: header,
Number: "1",
Expand Down Expand Up @@ -287,3 +326,16 @@ func convertLineToMap(line string) map[string]string {

return data
}

// newHeader returns a pointer to a Header whose Info, Format, Alt and Filter
// maps and whose Contig, Other and Samples slices are allocated and empty.
func newHeader() *Header {
	header := new(Header)

	header.Info = make(map[string]HeaderLineIdNumberTypeDescription)
	header.Format = make(map[string]HeaderLineIdNumberTypeDescription)
	header.Alt = make(map[string]HeaderLineIdDescription)
	header.Filter = make(map[string]HeaderLineIdDescription)

	header.Contig = make([]HeaderLineIdLength, 0)
	header.Other = make([]string, 0)
	header.Samples = make([]string, 0)

	return header
}
2 changes: 1 addition & 1 deletion svync_api/functions.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ func resolveFunction(input string, token string) string {
case "len":
result += fmt.Sprint(len(value[0]))
default:
logger.Fatalf("The function '%s' is not supported", value[1:])
logger.Fatalf("The function '%s' is not supported", function)
}
return result
}
Expand Down
10 changes: 7 additions & 3 deletions svync_api/resolve.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,12 @@ import (
"regexp"
"strconv"
"strings"

cli "github.com/urfave/cli/v2"
)

// Resolve a value
func ResolveValue(input string, variant *Variant, format *VariantFormat) string {
func ResolveValue(input string, variant *Variant, format *VariantFormat, Cctx *cli.Context) string {
logger := log.New(os.Stderr, "", 0)

// Replace all the FORMAT fields
Expand All @@ -27,7 +29,9 @@ func ResolveValue(input string, variant *Variant, format *VariantFormat) string

// TODO implement some alternative way to handle missing fields
if !ok {
logger.Printf("The field %s is not present in the FORMAT fields of the variant with ID %s, excluding it from this variant", field, variant.Id)
if !Cctx.Bool("mute-warnings") {
logger.Printf("The field %s is not present in the FORMAT fields of the variant with ID %s, excluding it from this variant", field, variant.Id)
}
} else if len(fieldSlice) > 2 {
index, err := strconv.ParseInt(fieldSlice[2], 0, 64)
if err != nil {
Expand All @@ -50,7 +54,7 @@ func ResolveValue(input string, variant *Variant, format *VariantFormat) string
// TODO implement some alternative way to handle missing fields
if !ok {
infoType := variant.Header.Info[field].Type
if infoType != "Flag" {
if infoType != "Flag" && !Cctx.Bool("mute-warnings") {
logger.Printf("The field %s is not present in the INFO fields of the variant with ID %s, excluding it from this variant", field, variant.Id)
}
} else if len(fieldSlice) > 2 {
Expand Down
Loading

0 comments on commit bd628f1

Please sign in to comment.